From 32fad0c32e947c3139004b4f32d6b16e8ed7b2df Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 08:20:12 +0100 Subject: [PATCH 01/12] :art: format with latest black version --- .gitignore | 5 +- pimmslearn/__init__.py | 9 +- pimmslearn/analyzers/__init__.py | 6 +- pimmslearn/analyzers/analyzers.py | 355 +++++++++++------- pimmslearn/analyzers/compare_predictions.py | 36 +- pimmslearn/analyzers/diff_analysis.py | 16 +- .../setup_diff_analysis_website.py | 61 +-- .../cmd_interface/setup_imp_cp_website.py | 60 +-- pimmslearn/data_handling.py | 28 +- pimmslearn/databases/diseases.py | 18 +- pimmslearn/filter.py | 9 +- pimmslearn/imputation.py | 53 ++- pimmslearn/io/__init__.py | 63 ++-- pimmslearn/io/dataloaders.py | 37 +- pimmslearn/io/datasets.py | 38 +- pimmslearn/io/datasplits.py | 130 ++++--- pimmslearn/io/format.py | 18 +- pimmslearn/io/load.py | 89 +++-- pimmslearn/logging.py | 18 +- pimmslearn/model.py | 19 +- pimmslearn/models/__init__.py | 182 +++++---- pimmslearn/models/ae.py | 124 +++--- pimmslearn/models/analysis.py | 1 + pimmslearn/models/collab.py | 54 +-- pimmslearn/models/collect_dumps.py | 36 +- pimmslearn/models/vae.py | 69 ++-- pimmslearn/nb.py | 34 +- pimmslearn/normalization.py | 11 +- pimmslearn/pandas/__init__.py | 110 +++--- pimmslearn/pandas/calc_errors.py | 23 +- pimmslearn/pandas/missing_data.py | 47 ++- pimmslearn/plotting/__init__.py | 273 ++++++++------ pimmslearn/plotting/data.py | 322 ++++++++-------- pimmslearn/plotting/defaults.py | 65 ++-- pimmslearn/plotting/errors.py | 190 ++++++---- pimmslearn/plotting/plotly.py | 9 +- pimmslearn/sampling.py | 87 ++--- pimmslearn/sklearn/__init__.py | 1 + pimmslearn/sklearn/ae_transformer.py | 84 +++-- pimmslearn/sklearn/cf_transformer.py | 95 ++--- pimmslearn/transform.py | 29 +- pimmslearn/utils.py | 50 +-- 42 files changed, 1651 insertions(+), 1313 deletions(-) diff --git a/.gitignore b/.gitignore index 1eef316c3..1472aff0a 100644 --- a/.gitignore +++ b/.gitignore @@ -45,4 +45,7 @@ workflows/maxquant/out/ # builds docs/_* docs/reference -build \ No newline at end of file +build + +#Mac +*.DS_Store diff --git a/pimmslearn/__init__.py b/pimmslearn/__init__.py index 262462099..09a681e4f 100644 --- a/pimmslearn/__init__.py +++ b/pimmslearn/__init__.py @@ -9,6 +9,7 @@ PyPI package is called pimms-learn (with a hyphen). """ + from __future__ import annotations # Set default logging handler to avoid "No handler found" warnings. @@ -31,14 +32,14 @@ savefig = pimmslearn.plotting.savefig -__license__ = 'GPLv3' +__license__ = "GPLv3" __version__ = metadata.version("pimms-learn") -__all__ = ['logging', 'nb', 'pandas', 'plotting', 'savefig'] +__all__ = ["logging", "nb", "pandas", "plotting", "savefig"] # set some defaults -njab.pandas.set_pandas_number_formatting(float_format='{:,.3f}') +njab.pandas.set_pandas_number_formatting(float_format="{:,.3f}") -pimmslearn.plotting.make_large_descriptors('x-large') +pimmslearn.plotting.make_large_descriptors("x-large") diff --git a/pimmslearn/analyzers/__init__.py b/pimmslearn/analyzers/__init__.py index 1c1944993..0709b9724 100644 --- a/pimmslearn/analyzers/__init__.py +++ b/pimmslearn/analyzers/__init__.py @@ -1,10 +1,10 @@ -"""General classes formalizing an experiment. -""" +"""General classes formalizing an experiment.""" + from types import SimpleNamespace from pimmslearn.analyzers import compare_predictions, diff_analysis -__all__ = ['diff_analysis', 'compare_predictions', 'Analysis'] +__all__ = ["diff_analysis", "compare_predictions", "Analysis"] class Analysis(SimpleNamespace): diff --git a/pimmslearn/analyzers/analyzers.py b/pimmslearn/analyzers/analyzers.py index 29c69b283..68931a08b 100644 --- a/pimmslearn/analyzers/analyzers.py +++ b/pimmslearn/analyzers/analyzers.py @@ -21,7 +21,7 @@ logger = logging.getLogger(__name__) -__doc__ = 'A collection of Analyzers to perform certain type of analysis.' +__doc__ = "A collection of Analyzers to perform certain type of analysis." ALPHA = 0.5 @@ -42,13 +42,19 @@ class AnalyzePeptides(SimpleNamespace): Many more attributes are set dynamically depending on the concrete analysis. """ - def __init__(self, data: pd.DataFrame, - is_log_transformed: bool = False, - is_wide_format: bool = True, ind_unstack: str = '',): + def __init__( + self, + data: pd.DataFrame, + is_log_transformed: bool = False, + is_wide_format: bool = True, + ind_unstack: str = "", + ): if not is_wide_format: if not ind_unstack: - raise ValueError("Please specify index level for unstacking via " - f"'ind_unstack' from: {data.index.names}") + raise ValueError( + "Please specify index level for unstacking via " + f"'ind_unstack' from: {data.index.names}" + ) data = data.unstack(ind_unstack) is_wide_format = True self.df = data # assume wide @@ -59,39 +65,52 @@ def __init__(self, data: pd.DataFrame, self.index_col = self.df.index.name @classmethod - def from_csv(cls, fname: str, - nrows: int = None, - # could be potentially 0 for the first column - index_col: Union[int, str, List] = 'Sample ID', - verify_fname: bool = False, - usecols=None, - **kwargs): - df = pd.read_csv(fname, index_col=index_col, low_memory=False, - nrows=nrows, usecols=usecols).squeeze('columns') + def from_csv( + cls, + fname: str, + nrows: int = None, + # could be potentially 0 for the first column + index_col: Union[int, str, List] = "Sample ID", + verify_fname: bool = False, + usecols=None, + **kwargs, + ): + df = pd.read_csv( + fname, index_col=index_col, low_memory=False, nrows=nrows, usecols=usecols + ).squeeze("columns") if len(df.shape) == 1: # unstack all but first column df = df.unstack(df.index.names[1:]) - verify_df(df=df, fname=fname, - index_col=index_col, - verify_fname=verify_fname, - usecols=usecols) + verify_df( + df=df, + fname=fname, + index_col=index_col, + verify_fname=verify_fname, + usecols=usecols, + ) return cls(data=df, **kwargs) # all __init__ parameters are kwargs @classmethod # @delegates(from_csv) # does only include parameters with defaults - def from_pickle(cls, fname: str, - # could be potentially 0 for the first column - index_col: Union[int, str, List] = 'Sample ID', - verify_fname: bool = False, - usecols=None, - **kwargs): + def from_pickle( + cls, + fname: str, + # could be potentially 0 for the first column + index_col: Union[int, str, List] = "Sample ID", + verify_fname: bool = False, + usecols=None, + **kwargs, + ): df = pd.read_pickle(fname).squeeze() if len(df.shape) == 1: df = df.unstack(df.index.names[1:]) - verify_df(df=df, fname=fname, - index_col=index_col, - verify_fname=verify_fname, - usecols=usecols) + verify_df( + df=df, + fname=fname, + index_col=index_col, + verify_fname=verify_fname, + usecols=usecols, + ) return cls(data=df, **kwargs) # all __init__ parameters are kwargs def get_consecutive_dates(self, n_samples, seed=42): @@ -106,7 +125,7 @@ def get_consecutive_dates(self, n_samples, seed=42): if seed: random.seed(42) - _attr_name = f'df_{n_samples}' + _attr_name = f"df_{n_samples}" setattr(self, _attr_name, get_consecutive_data_indices(self.df, n_samples)) print("Training data referenced unter:", _attr_name) self.df = getattr(self, _attr_name) @@ -115,15 +134,18 @@ def get_consecutive_dates(self, n_samples, seed=42): @property def df_long(self): - if hasattr(self, '_df_long'): + if hasattr(self, "_df_long"): return self._df_long - return self.to_long_format(colname_values='intensity', index_name=self.index_col) + return self.to_long_format( + colname_values="intensity", index_name=self.index_col + ) def to_long_format( - self, - colname_values: str = 'intensity', - index_name: str = 'Sample ID', - inplace: str = False) -> pd.DataFrame: + self, + colname_values: str = "intensity", + index_name: str = "Sample ID", + inplace: str = False, + ) -> pd.DataFrame: """[summary] Parameters @@ -143,7 +165,7 @@ def to_long_format( """Build long data view.""" if not self.is_wide_format: return self.df - if hasattr(self, '_df_long'): + if hasattr(self, "_df_long"): return self._df_long # rm attribute to overwrite df_long = long_format( @@ -164,10 +186,11 @@ def df_wide(self): return self.to_wide_format() def to_wide_format( - self, - columns: str = 'Sample ID', - name_values: str = 'intensity', - inplace: bool = False) -> pd.DataFrame: + self, + columns: str = "Sample ID", + name_values: str = "intensity", + inplace: bool = False, + ) -> pd.DataFrame: """[summary] Parameters @@ -195,7 +218,7 @@ def to_wide_format( if self.is_wide_format: return self.df - if hasattr(self, '_df_long'): + if hasattr(self, "_df_long"): df = self._df_long else: df = self.df @@ -216,7 +239,7 @@ def describe_peptides(self, sample_n: int = None): else: df = self.df stats = df.describe() - stats.loc['CV'] = stats.loc['std'] / stats.loc['mean'] + stats.loc["CV"] = stats.loc["std"] / stats.loc["mean"] self.stats.peptides = stats return stats @@ -231,10 +254,10 @@ def get_PCA(self, n_components=2, imputer=SimpleImputer): assert all(X.notna()) PCs, self.pca_ = run_pca(X, n_components=n_components) - if not hasattr(self, 'df_meta'): + if not hasattr(self, "df_meta"): logger.warning('No metadata available, please set "df_meta" first.') try: - PCs['ms_instrument'] = self.df_meta['ms_instrument'].astype('category') + PCs["ms_instrument"] = self.df_meta["ms_instrument"].astype("category") except KeyError: logger.warning("No MS instrument added.") except AttributeError: @@ -250,47 +273,60 @@ def calculate_PCs(self, new_df, is_wide=True): X = _add_indices(X, new_df) PCs = self.pca_.transform(X) PCs = _add_indices(PCs, new_df, index_only=True) - PCs.columns = [f'PC {i+1}' for i in range(PCs.shape[-1])] + PCs.columns = [f"PC {i+1}" for i in range(PCs.shape[-1])] return PCs - def plot_pca(self,): + def plot_pca( + self, + ): """Create principal component plot with three heatmaps showing instrument, degree of non NA data and sample by date.""" if not self.is_wide_format: self.df = self.df.unstack(self.df.index.names[1:]) self.is_wide_format = True - if not hasattr(self, 'df_meta'): + if not hasattr(self, "df_meta"): raise AttributeError('No metadata available, please set "df_meta" first.') PCs = self.get_PCA() - fig, axes = plt.subplots(nrows=3, ncols=1, figsize=( - 15, 20), constrained_layout=True) + fig, axes = plt.subplots( + nrows=3, ncols=1, figsize=(15, 20), constrained_layout=True + ) - Dim = namedtuple('DimensionsData', 'N M') + Dim = namedtuple("DimensionsData", "N M") self.dim = Dim(*self.df.shape) fig.suptitle( - f'First two Principal Components of {self.dim.M} most abundant peptides \n for {self.dim.N} samples', - fontsize=30) + f"First two Principal Components of {self.dim.M} most abundant peptides \n for {self.dim.N} samples", + fontsize=30, + ) # by instrument ax = axes[0] - seaborn_scatter(df=PCs.iloc[:, :2], fig=fig, ax=ax, - meta=PCs['ms_instrument'], title='by MS instrument') - ax.legend(loc='center right', bbox_to_anchor=(1.11, 0.5)) + seaborn_scatter( + df=PCs.iloc[:, :2], + fig=fig, + ax=ax, + meta=PCs["ms_instrument"], + title="by MS instrument", + ) + ax.legend(loc="center right", bbox_to_anchor=(1.11, 0.5)) # by complettness/missingness # continues colormap will be a bit trickier using seaborn: https://stackoverflow.com/a/44642014/9684872 ax = axes[1] - plot_scatter(df=PCs.iloc[:, :2], fig=fig, ax=ax, - meta=self.df_meta['prop_not_na'], title='by number on na') + plot_scatter( + df=PCs.iloc[:, :2], + fig=fig, + ax=ax, + meta=self.df_meta["prop_not_na"], + title="by number on na", + ) # by dates ax = axes[2] - plot_date_map(df=PCs.iloc[:, :2], - ax=ax, dates=self.df_meta.date) + plot_date_map(df=PCs.iloc[:, :2], ax=ax, dates=self.df_meta.date) return fig @@ -309,7 +345,8 @@ def log_transform(self, log_fct: np.ufunc): """ if self.is_log_transformed: raise Exception( - f'Data was already log transformed, using {self.__class__.__name__}.log_fct: {self.log_fct}') + f"Data was already log transformed, using {self.__class__.__name__}.log_fct: {self.log_fct}" + ) else: self.df = log_fct(self.df) self.is_log_transformed = True @@ -323,11 +360,14 @@ def get_dectection_limit(self): str Information on detection limit """ - self.detection_limit = self.df.min().min() if self.is_log_transformed else np.log10( - self.df).min().min() # all zeros become nan. + self.detection_limit = ( + self.df.min().min() + if self.is_log_transformed + else np.log10(self.df).min().min() + ) # all zeros become nan. return "Detection limit: {:6.3f}, corresponding to intensity value of {:,d}".format( - self.detection_limit, - int(10 ** self.detection_limit)) + self.detection_limit, int(10**self.detection_limit) + ) def __repr__(self): keys = sorted(self.__dict__) @@ -339,20 +379,27 @@ def __repr__(self): @property def fname_stub(self): - assert hasattr(self, 'df'), f'Attribute df is missing: {self}' - return 'N{:05d}_M{:05d}'.format(*self.df.shape) + assert hasattr(self, "df"), f"Attribute df is missing: {self}" + return "N{:05d}_M{:05d}".format(*self.df.shape) class LatentAnalysis(Analysis): - def __init__(self, latent_space: pd.DataFrame, meta_data: pd.DataFrame, model_name: str, - fig_size: Tuple[int, int] = (15, 15), folder: Path = None): + def __init__( + self, + latent_space: pd.DataFrame, + meta_data: pd.DataFrame, + model_name: str, + fig_size: Tuple[int, int] = (15, 15), + folder: Path = None, + ): self.latent_space, self.meta_data = latent_space, meta_data self.fig_size, self.folder = fig_size, folder self.model_name = model_name - self.folder = Path(self.folder) if self.folder else Path('.') - assert len( - self.latent_space.shape) == 2, "Expected a two dimensional DataFrame." + self.folder = Path(self.folder) if self.folder else Path(".") + assert ( + len(self.latent_space.shape) == 2 + ), "Expected a two dimensional DataFrame." self.latent_dim = self.latent_space.shape[-1] if self.latent_dim > 2: # pca, add option for different methods @@ -360,7 +407,7 @@ def __init__(self, latent_space: pd.DataFrame, meta_data: pd.DataFrame, model_na else: self.latent_reduced = self.latent_space - def plot_by_date(self, meta_key: str = 'date', save: bool = True): + def plot_by_date(self, meta_key: str = "date", save: bool = True): fig, ax = self._plot(fct=plot_date_map, meta_key=meta_key, save=save) return fig, ax @@ -372,15 +419,21 @@ def _plot(self, fct, meta_key: str, save: bool = True): try: meta_data = self.meta_data[meta_key] except KeyError: - raise ValueError(f"Requested key: '{meta_key}' is not in available," - f" use: {', '.join(x for x in self.meta_data.columns)}") + raise ValueError( + f"Requested key: '{meta_key}' is not in available," + f" use: {', '.join(x for x in self.meta_data.columns)}" + ) fig, ax = plt.subplots(figsize=self.fig_size) - _ = fct(df=self.latent_reduced, ax=ax, - meta=meta_data.loc[self.latent_reduced.index], - title=f'{self.model_name} latent space PCA of {self.latent_dim} dimensions by {meta_key}') + _ = fct( + df=self.latent_reduced, + ax=ax, + meta=meta_data.loc[self.latent_reduced.index], + title=f"{self.model_name} latent space PCA of {self.latent_dim} dimensions by {meta_key}", + ) if save: - pimmslearn.plotting._savefig(fig, name=f'{self.model_name}_latent_by_{meta_key}', - folder=self.folder) + pimmslearn.plotting._savefig( + fig, name=f"{self.model_name}_latent_by_{meta_key}", folder=self.folder + ) return fig, ax @@ -392,103 +445,117 @@ def get_consecutive_data_indices(df, n_samples): index = df.sort_index().index start_sample = len(index) - n_samples start_sample = random.randint(0, start_sample) - return df.loc[index[start_sample:start_sample + n_samples]] + return df.loc[index[start_sample : start_sample + n_samples]] def corr_lower_triangle(df, **kwargs): - """Compute the correlation matrix, returning only unique values. - """ + """Compute the correlation matrix, returning only unique values.""" corr_df = df.corr(**kwargs) - lower_triangle = pd.DataFrame( - np.tril(np.ones(corr_df.shape), -1)).astype(bool) + lower_triangle = pd.DataFrame(np.tril(np.ones(corr_df.shape), -1)).astype(bool) lower_triangle.index, lower_triangle.columns = corr_df.index, corr_df.columns return corr_df.where(lower_triangle) def plot_corr_histogram(corr_lower_triangle, bins=10): - fig, axes = plt.subplots(ncols=2, gridspec_kw={"width_ratios": [ - 5, 1], "wspace": 0.2}, figsize=(8, 4)) + fig, axes = plt.subplots( + ncols=2, gridspec_kw={"width_ratios": [5, 1], "wspace": 0.2}, figsize=(8, 4) + ) values = pd.Series(corr_lower_triangle.to_numpy().flatten()).dropna() ax = axes[0] ax = values.hist(ax=ax, bins=bins) ax.yaxis.set_major_formatter("{x:,.0f}") ax = axes[1] - plt.axis('off') + plt.axis("off") data = values.describe(percentiles=np.linspace(0.1, 1, 10)).round(2) - data.name = '' + data.name = "" _ = pd.plotting.table(ax=ax, data=data, loc="best", edges="open") return fig, axes -def plot_date_map(df, ax, - dates: pd.Series = None, - meta: pd.Series = None, - title: str = 'by date', - fontsize=8, - size=2): +def plot_date_map( + df, + ax, + dates: pd.Series = None, + meta: pd.Series = None, + title: str = "by date", + fontsize=8, + size=2, +): if dates is not None and meta is not None: raise ValueError("Only set either dates or meta parameters.") # ToDo: Clean up arguments if dates is None: dates = meta cols = list(df.columns) - assert len(cols) == 2, f'Please provide two dimensons, not {df.columns}' + assert len(cols) == 2, f"Please provide two dimensons, not {df.columns}" ax.set_title(title, fontsize=fontsize) ax.set_xlabel(cols[0]) ax.set_ylabel(cols[1]) - path_collection = scatter_plot_w_dates( - ax, df, dates=dates, errors='raise') + path_collection = scatter_plot_w_dates(ax, df, dates=dates, errors="raise") _ = add_date_colorbar(path_collection, ax=ax) -def plot_scatter(df, ax, - meta: pd.Series, - feat_name_display: str = 'features', - title: Optional[str] = None, - alpha=ALPHA, - fontsize=8, - size=2): +def plot_scatter( + df, + ax, + meta: pd.Series, + feat_name_display: str = "features", + title: Optional[str] = None, + alpha=ALPHA, + fontsize=8, + size=2, +): cols = list(df.columns) - assert len(cols) == 2, f'Please provide two dimensons, not {df.columns}' + assert len(cols) == 2, f"Please provide two dimensons, not {df.columns}" if not title: - title = f'by identified {feat_name_display}' + title = f"by identified {feat_name_display}" ax.set_title(title, fontsize=fontsize) ax.set_xlabel(cols[0]) ax.set_ylabel(cols[1]) path_collection = ax.scatter( - x=cols[0], y=cols[1], s=size, c=meta, data=df, alpha=alpha) - _ = ax.get_figure().colorbar(path_collection, ax=ax, - label=f'Identified {feat_name_display}', - # ticklocation='left', # ignored by matplotlib - location='right', # ! left does not put colobar without overlapping y ticks - format="{x:,.0f}", - ) - - -def seaborn_scatter(df, ax, - meta: pd.Series, - title: str = 'by some metadata', - alpha=ALPHA, - fontsize=5, - size=5): + x=cols[0], y=cols[1], s=size, c=meta, data=df, alpha=alpha + ) + _ = ax.get_figure().colorbar( + path_collection, + ax=ax, + label=f"Identified {feat_name_display}", + # ticklocation='left', # ignored by matplotlib + location="right", # ! left does not put colobar without overlapping y ticks + format="{x:,.0f}", + ) + + +def seaborn_scatter( + df, + ax, + meta: pd.Series, + title: str = "by some metadata", + alpha=ALPHA, + fontsize=5, + size=5, +): cols = list(df.columns) - assert len(cols) == 2, f'Please provide two dimensons, not {df.columns}' - seaborn.scatterplot(x=df[cols[0]], y=df[cols[1]], - hue=meta, ax=ax, palette='deep', s=size, alpha=alpha) - _ = ax.legend(fontsize=fontsize, - title_fontsize=fontsize, - markerscale=0.4, - title=meta.name, - ) + assert len(cols) == 2, f"Please provide two dimensons, not {df.columns}" + seaborn.scatterplot( + x=df[cols[0]], + y=df[cols[1]], + hue=meta, + ax=ax, + palette="deep", + s=size, + alpha=alpha, + ) + _ = ax.legend( + fontsize=fontsize, + title_fontsize=fontsize, + markerscale=0.4, + title=meta.name, + ) ax.set_title(title, fontsize=fontsize) return ax -def scatter_plot_w_dates(ax, df, - dates=None, - marker=None, - errors='raise', - size=2): +def scatter_plot_w_dates(ax, df, dates=None, marker=None, errors="raise", size=2): """plot first vs. second column in DataFrame. Use dates to color data. @@ -504,13 +571,16 @@ def scatter_plot_w_dates(ax, df, cols = df.columns if isinstance(dates, str): - dates = df['dates'] + dates = df["dates"] path_collection = ax.scatter( x=df[cols[0]], y=df[cols[1]], - c=[mdates.date2num(t) for t in pd.to_datetime(dates, errors=errors) - ] if dates is not None else None, + c=( + [mdates.date2num(t) for t in pd.to_datetime(dates, errors=errors)] + if dates is not None + else None + ), alpha=ALPHA, s=size, marker=marker, @@ -520,8 +590,9 @@ def scatter_plot_w_dates(ax, df, def add_date_colorbar(mappable, ax): loc = mdates.AutoDateLocator() - cbar = ax.get_figure().colorbar(mappable, ax=ax, ticks=loc, - format=mdates.AutoDateFormatter(loc)) + cbar = ax.get_figure().colorbar( + mappable, ax=ax, ticks=loc, format=mdates.AutoDateFormatter(loc) + ) return cbar @@ -538,5 +609,5 @@ def cast_object_to_category(df: pd.DataFrame) -> pd.DataFrame: pd.DataFrame DataFrame with category columns instead of object columns. """ - _columns = df.select_dtypes(include='object').columns - return df.astype({col: 'category' for col in _columns}) + _columns = df.select_dtypes(include="object").columns + return df.astype({col: "category" for col in _columns}) diff --git a/pimmslearn/analyzers/compare_predictions.py b/pimmslearn/analyzers/compare_predictions.py index e73a44d48..4b40c1036 100644 --- a/pimmslearn/analyzers/compare_predictions.py +++ b/pimmslearn/analyzers/compare_predictions.py @@ -6,7 +6,7 @@ import pandas as pd -def load_predictions(pred_files: List, shared_columns=['observed']): +def load_predictions(pred_files: List, shared_columns=["observed"]): pred_files = iter(pred_files) fname = next(pred_files) @@ -15,20 +15,27 @@ def load_predictions(pred_files: List, shared_columns=['observed']): for fname in pred_files: _pred_file = pd.read_csv(fname, index_col=[0, 1]) idx_shared = pred.index.intersection(_pred_file.index) - assert len(idx_shared), f'No shared index between already loaded models {pred.columns} and {fname}' + assert len( + idx_shared + ), f"No shared index between already loaded models {pred.columns} and {fname}" if shared_columns: - assert all(pred.loc[idx_shared, shared_columns] == _pred_file.loc[idx_shared, shared_columns]) + assert all( + pred.loc[idx_shared, shared_columns] + == _pred_file.loc[idx_shared, shared_columns] + ) pred = pred.join(_pred_file.drop(shared_columns, axis=1)) else: pred = pred.join(_pred_file) return pred -def load_split_prediction_by_modelkey(experiment_folder: Path, - split: str, - model_keys: list[str], - allow_missing=False, - shared_columns: list[str] = None): +def load_split_prediction_by_modelkey( + experiment_folder: Path, + split: str, + model_keys: list[str], + allow_missing=False, + shared_columns: list[str] = None, +): """Load predictions from a list of models. Parameters @@ -49,22 +56,25 @@ def load_split_prediction_by_modelkey(experiment_folder: Path, pd.DataFrame Prediction data frame with shared columns and model predictions """ - pred_files = [experiment_folder / 'preds' / - f'pred_{split}_{key}.csv' for key in model_keys] + pred_files = [ + experiment_folder / "preds" / f"pred_{split}_{key}.csv" for key in model_keys + ] to_remove = list() for file in pred_files: if not file.exists(): if allow_missing: - print(f'WARNING: {file} does not exist') + print(f"WARNING: {file} does not exist") to_remove.append(file) else: - raise FileNotFoundError(f'{file} does not exist') + raise FileNotFoundError(f"{file} does not exist") if to_remove: pred_files.remove(to_remove) return load_predictions(pred_files, shared_columns=shared_columns) -def load_single_csv_pred_file(fname: str | Path, value_name: str = 'intensity') -> pd.Series: +def load_single_csv_pred_file( + fname: str | Path, value_name: str = "intensity" +) -> pd.Series: """Load a single pred file from a single model. Last column are measurments, other are index. diff --git a/pimmslearn/analyzers/diff_analysis.py b/pimmslearn/analyzers/diff_analysis.py index 6b115f6ba..270c75565 100644 --- a/pimmslearn/analyzers/diff_analysis.py +++ b/pimmslearn/analyzers/diff_analysis.py @@ -7,24 +7,24 @@ logger = logging.getLogger() -Cutoffs = namedtuple('Cutoffs', 'feat_completness_over_samples min_feat_in_sample') +Cutoffs = namedtuple("Cutoffs", "feat_completness_over_samples min_feat_in_sample") -def select_raw_data(df: pd.DataFrame, - data_completeness: float, - frac_protein_groups: int) -> tuple[pd.DataFrame, Cutoffs]: - msg = 'N samples: {}, M feat: {}' +def select_raw_data( + df: pd.DataFrame, data_completeness: float, frac_protein_groups: int +) -> tuple[pd.DataFrame, Cutoffs]: + msg = "N samples: {}, M feat: {}" N, M = df.shape logger.info("Initally: " + msg.format(N, M)) min_sample_for_feat = int(N * data_completeness) df = df.dropna(axis=1, thresh=min_sample_for_feat) logger.info( - f"Dropped features quantified in less than {int(min_sample_for_feat)} samples.") + f"Dropped features quantified in less than {int(min_sample_for_feat)} samples." + ) N, M = df.shape logger.info("After feat selection: " + msg.format(N, M)) min_feat_per_sample = int(M * frac_protein_groups) - logger.info( - f"Min No. of Protein-Groups in single sample: {min_feat_per_sample}") + logger.info(f"Min No. of Protein-Groups in single sample: {min_feat_per_sample}") df = df.dropna(axis=0, thresh=min_feat_per_sample) logger.info("Finally: " + msg.format(*df.shape)) diff --git a/pimmslearn/cmd_interface/setup_diff_analysis_website.py b/pimmslearn/cmd_interface/setup_diff_analysis_website.py index c2f983847..4965cf5cf 100644 --- a/pimmslearn/cmd_interface/setup_diff_analysis_website.py +++ b/pimmslearn/cmd_interface/setup_diff_analysis_website.py @@ -1,4 +1,5 @@ """Console script to create or append index.rst for static website of differential analysis workflow.""" + import argparse import textwrap from collections import defaultdict @@ -6,7 +7,7 @@ def split_nb_name(nb: str) -> list: - return nb.split('.')[0].split('_') + return nb.split(".")[0].split("_") INDEX_RST = textwrap.dedent("""\ @@ -43,56 +44,60 @@ def split_nb_name(nb: str) -> list: def main(): parser = argparse.ArgumentParser( - description='Create or append index.rst for static website ' - 'displaying differential analysis notebooks.') - parser.add_argument('--folder', '-f', - type=str, - help='Path to the folder', - required=True) - parser.add_argument('--subfolder_comparision', '-sf_cp', - type=str, - help='Subfolder for comparison', - required=True) + description="Create or append index.rst for static website " + "displaying differential analysis notebooks." + ) + parser.add_argument( + "--folder", "-f", type=str, help="Path to the folder", required=True + ) + parser.add_argument( + "--subfolder_comparision", + "-sf_cp", + type=str, + help="Subfolder for comparison", + required=True, + ) args = parser.parse_args() folder_experiment = args.folder folder_experiment = Path(folder_experiment) subfolder_comparison = Path(args.subfolder_comparision) - nbs = [_f.relative_to(folder_experiment) for _f in subfolder_comparison.glob('**/*.ipynb') if _f.is_file()] + nbs = [ + _f.relative_to(folder_experiment) + for _f in subfolder_comparison.glob("**/*.ipynb") + if _f.is_file() + ] nbs groups = defaultdict(list) for nb in nbs: - _group = nb.name.split('_')[1] + _group = nb.name.split("_")[1] groups[_group].append(nb) groups = dict(groups) groups # Parse notebooks present in imputation workflow - nb_1 = '' - for nb in groups['1']: + nb_1 = "" + for nb in groups["1"]: nb_1 += " " * 4 + split_nb_name(nb.name)[-1] + f" <{nb.as_posix()}>\n" - nb_2 = '' - for nb in groups['2']: - nb_2 += " " * 4 + ' '.join(nb.parent.name.split('_')) + f" <{nb.as_posix()}>\n" + nb_2 = "" + for nb in groups["2"]: + nb_2 += " " * 4 + " ".join(nb.parent.name.split("_")) + f" <{nb.as_posix()}>\n" - nb_3 = '' - for nb in groups['3']: - nb_3 += " " * 4 + ' '.join(nb.parent.name.split('_')) + f" <{nb.as_posix()}>\n" + nb_3 = "" + for nb in groups["3"]: + nb_3 += " " * 4 + " ".join(nb.parent.name.split("_")) + f" <{nb.as_posix()}>\n" print(nb_3) - nb_4 = groups['4'][0] + nb_4 = groups["4"][0] nb_4 = " " * 4 + "Compare single features" + f" <{nb_4.as_posix()}>\n" - index_rst = INDEX_RST.format(nb_1=nb_1, - nb_2=nb_2, - nb_3=nb_3, - nb_4=nb_4) + index_rst = INDEX_RST.format(nb_1=nb_1, nb_2=nb_2, nb_3=nb_3, nb_4=nb_4) # append to index.rst - with open(folder_experiment / 'index.rst', 'a') as f: + with open(folder_experiment / "index.rst", "a") as f: f.write(index_rst) msg = f"""\ @@ -106,5 +111,5 @@ def main(): print(msg) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pimmslearn/cmd_interface/setup_imp_cp_website.py b/pimmslearn/cmd_interface/setup_imp_cp_website.py index 0f4c8ebd9..42c7d2b81 100644 --- a/pimmslearn/cmd_interface/setup_imp_cp_website.py +++ b/pimmslearn/cmd_interface/setup_imp_cp_website.py @@ -1,5 +1,6 @@ """Console script to create index.rst and conf.py for static website of the imputation comparison workflow.""" + import argparse import textwrap from collections import defaultdict @@ -7,7 +8,7 @@ def split_nb_name(nb: str) -> list: - return nb.split('.')[0].split('_') + return nb.split(".")[0].split("_") INDEX_RST = textwrap.dedent("""\ @@ -116,59 +117,60 @@ def split_nb_name(nb: str) -> list: def main(): parser = argparse.ArgumentParser( - description='Create index.rst and conf.py for static website ' - 'of the imputation comparison workflow.') - parser.add_argument('--folder', '-f', - type=str, - help='Path to the folder', - required=True) + description="Create index.rst and conf.py for static website " + "of the imputation comparison workflow." + ) + parser.add_argument( + "--folder", "-f", type=str, help="Path to the folder", required=True + ) args = parser.parse_args() folder_experiment = args.folder folder_experiment = Path(folder_experiment) - nbs = [_f.name for _f in folder_experiment.iterdir() if _f.suffix == '.ipynb'] + nbs = [_f.name for _f in folder_experiment.iterdir() if _f.suffix == ".ipynb"] nbs groups = defaultdict(list) for nb in nbs: - _group = nb.split('_')[1] + _group = nb.split("_")[1] groups[_group].append(nb) groups = dict(groups) groups # Parse notebooks present in imputation workflow - nb_0 = '' - for nb in groups['0']: + nb_0 = "" + for nb in groups["0"]: nb_0 += " " * 4 + f"{nb}\n" - nb_1_PIMMS = '' - for nb in groups['1']: - if '_NAGuideR_' not in nb: + nb_1_PIMMS = "" + for nb in groups["1"]: + if "_NAGuideR_" not in nb: nb_1_PIMMS += " " * 4 + split_nb_name(nb)[-1] + f" <{nb}>\n" - nb_1_NAGuideR = '' - for nb in groups['1']: - if '_NAGuideR_' in nb: + nb_1_NAGuideR = "" + for nb in groups["1"]: + if "_NAGuideR_" in nb: _model = split_nb_name(nb)[-1] if _model.isupper(): nb_1_NAGuideR += " " * 4 + _model + f" <{nb}>\n" else: - nb_1_NAGuideR += " " * 4 + ' '.join(split_nb_name(nb[5:])) + f" <{nb}>\n" + nb_1_NAGuideR += ( + " " * 4 + " ".join(split_nb_name(nb[5:])) + f" <{nb}>\n" + ) - nb_2 = '' - for nb in groups['2']: - nb_2 += " " * 4 + ' '.join(split_nb_name(nb[5:])) + f" <{nb}>\n" + nb_2 = "" + for nb in groups["2"]: + nb_2 += " " * 4 + " ".join(split_nb_name(nb[5:])) + f" <{nb}>\n" - index_rst = INDEX_RST.format(nb_0=nb_0, - nb_1_PIMMS=nb_1_PIMMS, - nb_1_NAGuideR=nb_1_NAGuideR, - nb_2=nb_2) + index_rst = INDEX_RST.format( + nb_0=nb_0, nb_1_PIMMS=nb_1_PIMMS, nb_1_NAGuideR=nb_1_NAGuideR, nb_2=nb_2 + ) # write to file and print further instructions - with open(folder_experiment / 'index.rst', 'w') as f: + with open(folder_experiment / "index.rst", "w") as f: f.write(index_rst) - with open(folder_experiment / 'conf.py', 'w') as f: + with open(folder_experiment / "conf.py", "w") as f: f.write(CONF_PY) msg = f"""\ @@ -206,9 +208,9 @@ def main(): msg = textwrap.dedent(msg) print(msg) - with open(folder_experiment / 'README.md', 'w') as f: + with open(folder_experiment / "README.md", "w") as f: f.write(msg) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pimmslearn/data_handling.py b/pimmslearn/data_handling.py index 41be078ac..66cbe6c68 100644 --- a/pimmslearn/data_handling.py +++ b/pimmslearn/data_handling.py @@ -1,6 +1,7 @@ """ Functionality to handle protein and peptide datasets. """ + import numpy as np import pandas as pd @@ -16,29 +17,32 @@ def coverage(X: pd.DataFrame, coverage_col: float, coverage_row: float): return df -def compute_stats_missing(X: pd.DataFrame, - col_no_missing: str = 'no_missing', - col_no_identified: str = 'no_identified', - col_prop_samples: str = 'prop_samples') -> pd.DataFrame: +def compute_stats_missing( + X: pd.DataFrame, + col_no_missing: str = "no_missing", + col_no_identified: str = "no_identified", + col_prop_samples: str = "prop_samples", +) -> pd.DataFrame: """Dataset of repeated samples indicating if an observation has the variables observed or missing x in {0,1}""" if X.index.name: index_col = X.index.name else: - index_col = 'INDEX' + index_col = "INDEX" sample_stats = X.index.to_frame(index=False).reset_index() - sample_stats.columns = ['SampleID_int', index_col] + sample_stats.columns = ["SampleID_int", index_col] sample_stats.set_index(index_col, inplace=True) sample_stats[col_no_identified] = X.sum(axis=1) sample_stats[col_no_missing] = (X == 0).sum(axis=1) - assert all(sample_stats[[col_no_identified, col_no_missing]].sum( - axis=1) == X.shape[1]) - sample_stats = sample_stats.sort_values( - by=col_no_identified, ascending=False) - sample_stats[col_prop_samples] = np.array( - range(1, len(sample_stats) + 1)) / len(sample_stats) + assert all( + sample_stats[[col_no_identified, col_no_missing]].sum(axis=1) == X.shape[1] + ) + sample_stats = sample_stats.sort_values(by=col_no_identified, ascending=False) + sample_stats[col_prop_samples] = np.array(range(1, len(sample_stats) + 1)) / len( + sample_stats + ) return sample_stats diff --git a/pimmslearn/databases/diseases.py b/pimmslearn/databases/diseases.py index 5c1307792..361bbbd28 100644 --- a/pimmslearn/databases/diseases.py +++ b/pimmslearn/databases/diseases.py @@ -5,18 +5,22 @@ def get_disease_association(doid: int, limit: int = 1000): - params = {'type1': -26, - 'id1': f'DOID:{doid}', - 'type2': 9606, - 'limit': limit, - 'format': 'json'} - diseases_url_all = 'https://api.jensenlab.org/Integration' + params = { + "type1": -26, + "id1": f"DOID:{doid}", + "type2": 9606, + "limit": limit, + "format": "json", + } + diseases_url_all = "https://api.jensenlab.org/Integration" r = requests.get(diseases_url_all, params=params) if r.status_code == 200: data, is_there_more = r.json() else: - raise ValueError(f"Could not get valid data back, response code: {r.status_code}") + raise ValueError( + f"Could not get valid data back, response code: {r.status_code}" + ) if is_there_more: logger.warning("There are more associations available") return data diff --git a/pimmslearn/filter.py b/pimmslearn/filter.py index 2d26c4806..f4cc86a4e 100644 --- a/pimmslearn/filter.py +++ b/pimmslearn/filter.py @@ -5,11 +5,10 @@ logger = logging.getLogger(__name__) -def select_features(df: pd.DataFrame, - feat_prevalence: float = .2, - axis: int = 0) -> pd.DataFrame: - """Select features or samples with a minimum prevalence. - """ +def select_features( + df: pd.DataFrame, feat_prevalence: float = 0.2, axis: int = 0 +) -> pd.DataFrame: + """Select features or samples with a minimum prevalence.""" N = df.shape[axis] minimum_freq = N * feat_prevalence freq = df.notna().sum(axis=axis) diff --git a/pimmslearn/imputation.py b/pimmslearn/imputation.py index 36b065516..65216c88d 100644 --- a/pimmslearn/imputation.py +++ b/pimmslearn/imputation.py @@ -5,6 +5,7 @@ """ + import logging from typing import Dict, Tuple @@ -17,12 +18,14 @@ RANDOMSEED = 123 -def impute_shifted_normal(df_wide: pd.DataFrame, - mean_shift: float = 1.8, - std_shrinkage: float = 0.3, - completeness: float = 0.6, - axis=1, - seed=RANDOMSEED) -> pd.Series: +def impute_shifted_normal( + df_wide: pd.DataFrame, + mean_shift: float = 1.8, + std_shrinkage: float = 0.3, + completeness: float = 0.6, + axis=1, + seed=RANDOMSEED, +) -> pd.Series: """Get replacements for missing values. Parameters @@ -51,9 +54,11 @@ def impute_shifted_normal(df_wide: pd.DataFrame, selected = df_wide.dropna(axis=0, thresh=min_M) else: raise ValueError( - "Please specify axis as 0 or 1, for axis along which to impute.") + "Please specify axis as 0 or 1, for axis along which to impute." + ) logger.info( - f"Meand and standard deviation based on seleted data of shape {selected.shape}") + f"Meand and standard deviation based on seleted data of shape {selected.shape}" + ) mean = selected.mean(axis=axis) std = selected.std(axis=axis) mean_shifted = mean - (std * mean_shift) @@ -66,30 +71,42 @@ def impute_shifted_normal(df_wide: pd.DataFrame, imputed_shifted_normal = pd.DataFrame( np.random.normal(mean_shifted, std_shrinked, size=(M, N)), index=df_wide.columns, - columns=df_wide.index) + columns=df_wide.index, + ) imputed_shifted_normal = imputed_shifted_normal.T else: imputed_shifted_normal = pd.DataFrame( np.random.normal(mean_shifted, std_shrinked, size=(N, M)), index=df_wide.index, - columns=df_wide.columns) + columns=df_wide.columns, + ) imputed_shifted_normal = imputed_shifted_normal[df_wide.isna()].stack() return imputed_shifted_normal -def compute_moments_shift(observed: pd.Series, imputed: pd.Series, - names: Tuple[str, str] = ('observed', 'imputed')) -> Dict[str, float]: +def compute_moments_shift( + observed: pd.Series, + imputed: pd.Series, + names: Tuple[str, str] = ("observed", "imputed"), +) -> Dict[str, float]: """Summary of overall shift of mean and std. dev. of predictions for a imputation method.""" name_obs, name_model = names - data = {name: {'mean': series.mean(), 'std': series.std()} for series, name in zip([observed, imputed], names)} + data = { + name: {"mean": series.mean(), "std": series.std()} + for series, name in zip([observed, imputed], names) + } observed, imputed = data[name_obs], data[name_model] - data[name_model]['mean shift (in std)'] = (observed["mean"] - imputed["mean"]) / observed["std"] - data[name_model]['std shrinkage'] = imputed["std"] / observed["std"] + data[name_model]["mean shift (in std)"] = ( + observed["mean"] - imputed["mean"] + ) / observed["std"] + data[name_model]["std shrinkage"] = imputed["std"] / observed["std"] return data -def stats_by_level(series: pd.Series, index_level: int = 0, min_count: int = 5) -> pd.Series: +def stats_by_level( + series: pd.Series, index_level: int = 0, min_count: int = 5 +) -> pd.Series: """Count, mean and std. dev. by index level.""" - agg = series.groupby(level=index_level).agg(['count', 'mean', 'std']) - agg = agg.loc[agg['count'] > min_count] + agg = series.groupby(level=index_level).agg(["count", "mean", "std"]) + agg = agg.loc[agg["count"] > min_count] return agg.mean() diff --git a/pimmslearn/io/__init__.py b/pimmslearn/io/__init__.py index f86ceed99..2454afbd5 100644 --- a/pimmslearn/io/__init__.py +++ b/pimmslearn/io/__init__.py @@ -10,14 +10,14 @@ import pimmslearn.pandas -PathsList = namedtuple('PathsList', ['files', 'folder']) +PathsList = namedtuple("PathsList", ["files", "folder"]) logger = logging.getLogger(__name__) logger.info(f"Calling from {__name__}") -def search_files(path='.', query='.txt'): +def search_files(path=".", query=".txt"): """Uses Pathlib to find relative to path files with the query text in their file names. Returns the path relative to the specified path. @@ -42,27 +42,27 @@ def search_files(path='.', query='.txt'): return PathsList(files=files, folder=path) -def search_subfolders(path='.', depth: int = 1, exclude_root: bool = False): +def search_subfolders(path=".", depth: int = 1, exclude_root: bool = False): """Search subfolders relative to given path.""" if not isinstance(depth, int) and depth > 0: - raise ValueError( - f"Please provide an strictly positive integer, not {depth}") + raise ValueError(f"Please provide an strictly positive integer, not {depth}") EXCLUDED = ["*ipynb_checkpoints*"] path = Path(path) directories = [path] def get_subfolders(path): - return [x for x in path.iterdir() - if x.is_dir() and not any(x.match(excl) for excl in EXCLUDED) - ] + return [ + x + for x in path.iterdir() + if x.is_dir() and not any(x.match(excl) for excl in EXCLUDED) + ] directories_previous = directories.copy() while depth > 0: directories_new = list() for p in directories_previous: - directories_new.extend( - get_subfolders(p)) + directories_new.extend(get_subfolders(p)) directories.extend(directories_new) directories_previous = directories_new.copy() depth -= 1 @@ -72,30 +72,30 @@ def get_subfolders(path): return directories -def resolve_path(path: Union[str, Path], to: Union[str, Path] = '.') -> Path: +def resolve_path(path: Union[str, Path], to: Union[str, Path] = ".") -> Path: """Resolve a path partly overlapping with to another path.""" pwd = Path(to).absolute() pwd = [p for p in pwd.parts] ret = [p for p in Path(path).parts if p not in pwd] - return Path('/'.join(ret)) + return Path("/".join(ret)) -def get_fname_from_keys(keys, folder='.', file_ext='.pkl', remove_duplicates=True): +def get_fname_from_keys(keys, folder=".", file_ext=".pkl", remove_duplicates=True): if remove_duplicates: # https://stackoverflow.com/a/53657523/9684872 keys = list(dict.fromkeys(keys)) folder = Path(folder) folder.mkdir(exist_ok=True, parents=True) - fname_dataset = folder / '{}{}'.format(pimmslearn.pandas.replace_with( - ' '.join(keys), replace='- ', replace_with='_'), file_ext) + fname_dataset = folder / "{}{}".format( + pimmslearn.pandas.replace_with(" ".join(keys), replace="- ", replace_with="_"), + file_ext, + ) return fname_dataset -def dump_to_csv(df: pd.DataFrame, - folder: Path, - outfolder: Path, - parent_folder_fct=None - ) -> None: +def dump_to_csv( + df: pd.DataFrame, folder: Path, outfolder: Path, parent_folder_fct=None +) -> None: fname = f"{folder.stem}.csv" if parent_folder_fct is not None: outfolder = outfolder / parent_folder_fct(folder) @@ -116,17 +116,17 @@ def dump_json(data_dict: dict, filename: Union[str, Path]): filename : Union[str, Path] Filepath to save dictionary as JSON. """ - with open(filename, 'w') as f: + with open(filename, "w") as f: json.dump(obj=data_dict, fp=f, indent=4) def to_pickle(obj, fname): - with open(fname, 'wb') as f: + with open(fname, "wb") as f: pickle.dump(obj, f) def from_pickle(fname): - with open(fname, 'rb') as f: + with open(fname, "rb") as f: return pickle.load(f) @@ -148,13 +148,17 @@ def load_json(fname: Union[str, Path]) -> dict: return d -def parse_dict(input_dict: dict, - types: Tuple[Tuple] = ((PurePath, lambda p: str(PurePosixPath(p))), - (np.ndarray, lambda a: a.to_list()))): +def parse_dict( + input_dict: dict, + types: Tuple[Tuple] = ( + (PurePath, lambda p: str(PurePosixPath(p))), + (np.ndarray, lambda a: a.to_list()), + ), +): """Transform a set of items (instances) to their string representation""" d = dict() for k, v in input_dict.items(): - for (old_type, fct) in types: + for old_type, fct in types: if isinstance(v, old_type): v = fct(v) d[k] = v @@ -184,8 +188,9 @@ def extend_name(fname: Union[str, Path], extend_by: str, ext: str = None) -> Pat return fname -def add_indices(array: np.array, original_df: pd.DataFrame, - index_only: bool = False) -> pd.DataFrame: +def add_indices( + array: np.array, original_df: pd.DataFrame, index_only: bool = False +) -> pd.DataFrame: """Add indices to array using provided origional DataFrame. Parameters diff --git a/pimmslearn/io/dataloaders.py b/pimmslearn/io/dataloaders.py index a49776a35..ded2e109b 100644 --- a/pimmslearn/io/dataloaders.py +++ b/pimmslearn/io/dataloaders.py @@ -1,4 +1,3 @@ - import pandas import pandas as pd from fastai.data.all import * @@ -11,11 +10,13 @@ from pimmslearn.transform import VaepPipeline -def get_dls(train_X: pandas.DataFrame, - valid_X: pandas.DataFrame, - transformer: VaepPipeline, - bs: int = 64, - num_workers=0) -> DataLoaders: +def get_dls( + train_X: pandas.DataFrame, + valid_X: pandas.DataFrame, + transformer: VaepPipeline, + bs: int = 64, + num_workers=0, +) -> DataLoaders: """Create training and validation dataloaders Parameters @@ -55,12 +56,11 @@ def get_dls(train_X: pandas.DataFrame, decode=['normalize']) dls = get_dls(train_X, val_X, transforms, bs=4) """ - train_ds = datasets.DatasetWithTarget(df=train_X, - transformer=transformer) + train_ds = datasets.DatasetWithTarget(df=train_X, transformer=transformer) if valid_X is not None: - valid_ds = datasets.DatasetWithTargetSpecifyTarget(df=train_X, - targets=valid_X, - transformer=transformer) + valid_ds = datasets.DatasetWithTargetSpecifyTarget( + df=train_X, targets=valid_X, transformer=transformer + ) else: # empty dataset will be ignored by fastai in training loops valid_ds = datasets.DatasetWithTarget(df=pd.DataFrame()) @@ -70,16 +70,19 @@ def get_dls(train_X: pandas.DataFrame, if (len(train_X) % bs) == 1: # Batch-Normalization does not work with batches of size one drop_last = True - return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=drop_last, - num_workers=num_workers) + return DataLoaders.from_dsets( + train_ds, valid_ds, bs=bs, drop_last=drop_last, num_workers=num_workers + ) # dls.test_dl # needs to be part of setup procedure of a class -def get_test_dl(df: pandas.DataFrame, - transformer: VaepPipeline, - dataset: Dataset = DatasetWithTarget, - bs: int = 64): +def get_test_dl( + df: pandas.DataFrame, + transformer: VaepPipeline, + dataset: Dataset = DatasetWithTarget, + bs: int = 64, +): """[summary] Parameters diff --git a/pimmslearn/io/datasets.py b/pimmslearn/io/datasets.py index a66fa5723..3807a6933 100644 --- a/pimmslearn/io/datasets.py +++ b/pimmslearn/io/datasets.py @@ -12,7 +12,7 @@ class PeptideDatasetInMemory(Dataset): """Peptide Dataset fully in memory.""" - nan = torch.tensor(float('NaN')) + nan = torch.tensor(float("NaN")) def __init__(self, data: np.array, mask: np.array = None, fill_na=0.0): """Build torch.Tensors for DataLoader. @@ -36,11 +36,11 @@ def __init__(self, data: np.array, mask: np.array = None, fill_na=0.0): self.y = torch.where(~self.mask, self.peptides, self.nan) if mask is not None: - self.peptides = torch.where( - ~self.mask, self.nan, self.peptides) + self.peptides = torch.where(~self.mask, self.nan, self.peptides) - self.peptides = torch.where(self.peptides.isnan(), - torch.FloatTensor([fill_na]), self.peptides) + self.peptides = torch.where( + self.peptides.isnan(), torch.FloatTensor([fill_na]), self.peptides + ) self.length_ = len(self.peptides) @@ -62,17 +62,17 @@ class DatasetWithMaskAndNoTarget(Dataset): def __init__(self, df: pd.DataFrame, transformer: sklearn.pipeline.Pipeline = None): if not issubclass(type(df), pd.DataFrame): - raise ValueError( - f'please pass a pandas DataFrame, not: {type(df) = }') + raise ValueError(f"please pass a pandas DataFrame, not: {type(df) = }") self.mask_isna = df.isna() # .astype('uint8') # in case 0,1 is preferred self.columns = df.columns self.transformer = transformer if transformer: - if hasattr(transformer, 'transform'): + if hasattr(transformer, "transform"): df = transformer.transform(df) else: raise AttributeError( - f'{type(transformer)} is not sklearn compatible, has no inverse_transform.') + f"{type(transformer)} is not sklearn compatible, has no inverse_transform." + ) self.data = df self.length_ = len(self.data) @@ -95,8 +95,12 @@ def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: class DatasetWithTargetSpecifyTarget(DatasetWithMaskAndNoTarget): - def __init__(self, df: pd.DataFrame, targets: pd.DataFrame, - transformer: sklearn.pipeline.Pipeline = None): + def __init__( + self, + df: pd.DataFrame, + targets: pd.DataFrame, + transformer: sklearn.pipeline.Pipeline = None, + ): """Create a dataset for validation. Parameters @@ -109,21 +113,23 @@ def __init__(self, df: pd.DataFrame, targets: pd.DataFrame, transformation pipeline to use, by default None """ if not issubclass(type(df), pd.DataFrame): - raise ValueError( - f'please pass a pandas DataFrame, not: {type(df) = }') + raise ValueError(f"please pass a pandas DataFrame, not: {type(df) = }") self.mask_isna = targets.isna() self.columns = df.columns self.transformer = transformer - self.target = df.fillna(targets) # not really necessary, without mask would not be needed + self.target = df.fillna( + targets + ) # not really necessary, without mask would not be needed if transformer: - if hasattr(transformer, 'transform'): + if hasattr(transformer, "transform"): df = transformer.transform(df) self.target = transformer.transform(self.target) else: raise AttributeError( - f'{type(transformer)} is not sklearn compatible, has no inverse_transform.') + f"{type(transformer)} is not sklearn compatible, has no inverse_transform." + ) self.data = df self.length_ = len(self.data) diff --git a/pimmslearn/io/datasplits.py b/pimmslearn/io/datasplits.py index e3e886724..70498d414 100644 --- a/pimmslearn/io/datasplits.py +++ b/pimmslearn/io/datasplits.py @@ -13,15 +13,18 @@ logger = logging.getLogger(__name__) -FILE_FORMAT_TO_DUMP_FCT = {'pkl': ('to_pickle', 'read_pickle'), - # 'pickle': 'to_pickle', - 'csv': ('to_csv', 'read_csv')} - - -def long_format(df: pd.DataFrame, - colname_values: str = 'intensity', - # index_name: str = 'Sample ID' - ) -> pd.DataFrame: +FILE_FORMAT_TO_DUMP_FCT = { + "pkl": ("to_pickle", "read_pickle"), + # 'pickle': 'to_pickle', + "csv": ("to_csv", "read_csv"), +} + + +def long_format( + df: pd.DataFrame, + colname_values: str = "intensity", + # index_name: str = 'Sample ID' +) -> pd.DataFrame: # ToDo: Docstring as in class when finalized names = df.columns.names if None in names: @@ -35,9 +38,9 @@ def long_format(df: pd.DataFrame, ) -def wide_format(df: pd.DataFrame, - columns: str = 'Sample ID', - name_values: str = 'intensity') -> pd.DataFrame: +def wide_format( + df: pd.DataFrame, columns: str = "Sample ID", name_values: str = "intensity" +) -> pd.DataFrame: # ToDo: Docstring as in class when finalized df_wide = df.pivot(columns=columns, values=name_values) df_wide = df_wide.T @@ -45,7 +48,7 @@ def wide_format(df: pd.DataFrame, @dataclass -class DataSplits(): +class DataSplits: is_wide_format: bool = field(init=True, repr=False) train_X: pd.DataFrame = None val_y: pd.DataFrame = None @@ -54,26 +57,39 @@ class DataSplits(): def __post_init__(self): self._items = sorted(self.__dict__) self._is_wide = self.is_wide_format - self._items.remove('is_wide_format') + self._items.remove("is_wide_format") def __getitem__(self, index): return (self._items[index], getattr(self, self._items[index])) def __dir__(self): # return self._items - return ['dump', 'from_folder', 'interpolate', 'load', 'test_X', 'test_y', - 'to_long_format', 'to_wide_format', 'train_X', 'val_X', 'val_y'] - - def dump(self, folder='data', file_format='csv') -> dict: + return [ + "dump", + "from_folder", + "interpolate", + "load", + "test_X", + "test_y", + "to_long_format", + "to_wide_format", + "train_X", + "val_X", + "val_y", + ] + + def dump(self, folder="data", file_format="csv") -> dict: """dump in long format.""" folder = Path(folder) folder.mkdir(parents=True, exist_ok=True) if file_format not in FILE_FORMAT_TO_DUMP_FCT: - raise ValueError(f"Select one of these formats: {', '.join(FILE_FORMAT_TO_DUMP_FCT.keys())}") + raise ValueError( + f"Select one of these formats: {', '.join(FILE_FORMAT_TO_DUMP_FCT.keys())}" + ) dumps = {} n_dumped = 0 - for (_attr, _df) in self: + for _attr, _df in self: if _df is None: logger.info(f"Missing attribute: {_attr}") continue @@ -94,26 +110,38 @@ def dump(self, folder='data', file_format='csv') -> dict: dump_fct(fname) n_dumped += 1 if not n_dumped: - raise ValueError(f'Nothing to dump, all None: {self}') + raise ValueError(f"Nothing to dump, all None: {self}") # _df.to_json(fname) # does not work easily for series return dumps - def load(self, folder: str, use_wide_format=False, file_format='csv') -> None: + def load(self, folder: str, use_wide_format=False, file_format="csv") -> None: """Load data in place from folder""" items = dict(self.__annotations__) - del items['is_wide_format'] - args = load_items(folder=folder, items=items, use_wide_format=use_wide_format, file_format=file_format) + del items["is_wide_format"] + args = load_items( + folder=folder, + items=items, + use_wide_format=use_wide_format, + file_format=file_format, + ) for _attr, _df in args.items(): setattr(self, _attr, _df) self._is_wide = use_wide_format return None # could also be self @classmethod - def from_folder(cls, folder: str, use_wide_format=False, file_format='csv') -> DataSplits: + def from_folder( + cls, folder: str, use_wide_format=False, file_format="csv" + ) -> DataSplits: """Build DataSplits instance from folder.""" items = dict(cls.__annotations__) - del items['is_wide_format'] - args = load_items(folder=folder, items=items, use_wide_format=use_wide_format, file_format=file_format) + del items["is_wide_format"] + args = load_items( + folder=folder, + items=items, + use_wide_format=use_wide_format, + file_format=file_format, + ) _data_splits = cls(**args, is_wide_format=use_wide_format) _data_splits._is_wide = use_wide_format return _data_splits @@ -129,7 +157,7 @@ def to_wide_format(self): setattr(self, _attr, _df) self._is_wide = True - def to_long_format(self, name_values: str = 'intensity'): + def to_long_format(self, name_values: str = "intensity"): if not self._is_wide: return @@ -151,34 +179,42 @@ def interpolate(self, dataset: Union[str, pd.DataFrame]): try: ds = getattr(self, dataset) except AttributeError: - raise AttributeError(f"Please provide a valid attribute, not '{dataset}'. " - "Valid attributes are {}".format(', '.join(x for x in self._items))) - if dataset[-1] in ['y', 'Y']: + raise AttributeError( + f"Please provide a valid attribute, not '{dataset}'. " + "Valid attributes are {}".format(", ".join(x for x in self._items)) + ) + if dataset[-1] in ["y", "Y"]: logger.warning( - f'Attempting to interpolate target: {dataset} ' - '(this might make sense, but a warning') + f"Attempting to interpolate target: {dataset} " + "(this might make sense, but a warning" + ) if ds is None: - raise ValueError(f'Attribute is None: {dataset!r}.') + raise ValueError(f"Attribute is None: {dataset!r}.") if not self._is_wide: ds = ds.unstack() # series is unstack to DataFrame else: - raise TypeError(f"Unknown type: {classname(dataset)}." - f" None of str, {class_full_module(pd.DataFrame)}, {class_full_module(pd.Series)}" - ) + raise TypeError( + f"Unknown type: {classname(dataset)}." + f" None of str, {class_full_module(pd.DataFrame)}, {class_full_module(pd.Series)}" + ) return interpolate(wide_df=ds) -def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv') -> dict: +def load_items( + folder: str, items: dict, use_wide_format=False, file_format="csv" +) -> dict: folder = Path(folder) - assert folder.exists(), f'Could not find folder: {folder}' + assert folder.exists(), f"Could not find folder: {folder}" args = {} for _attr, _cls in items.items(): # assert issubclass(_cls, (pd.DataFrame, pd.Series)) # now strings, see # https://docs.python.org/3/whatsnew/3.7.html#pep-563-postponed-evaluation-of-annotations fname = folder / f"{_attr}.{file_format}" if not fname.exists(): - raise FileNotFoundError(f"Missing file requested for attr '{_attr}', missing {fname}") + raise FileNotFoundError( + f"Missing file requested for attr '{_attr}', missing {fname}" + ) read_fct = getattr(pd, FILE_FORMAT_TO_DUMP_FCT[file_format][1]) _df = read_fct(fname) # logic below is suited for csv reader -> maybe split up loading and saving later? @@ -186,7 +222,9 @@ def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv _df = _df.to_frame().reset_index() # in case Series was pickled cols = list(_df.columns) if use_wide_format: - _df = wide_format(_df.set_index(cols[1:-1]), columns=cols[0], name_values=cols[-1]) + _df = wide_format( + _df.set_index(cols[1:-1]), columns=cols[0], name_values=cols[-1] + ) else: _df.set_index(cols[:-1], inplace=True) logger.info(f"Loaded '{_attr}' from file: {fname}") @@ -195,13 +233,13 @@ def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv # set default file name -> intergrate into DataSplits? -def load_freq(folder: str, file='freq_features.pkl'): +def load_freq(folder: str, file="freq_features.pkl"): folder = Path(folder) fname = folder / file - if fname.suffix == '.json': - freq_per_feature = pd.read_json(fname, orient='index').squeeze() - freq_per_feature.name = 'freq' - elif fname.suffix == '.pkl': + if fname.suffix == ".json": + freq_per_feature = pd.read_json(fname, orient="index").squeeze() + freq_per_feature.name = "freq" + elif fname.suffix == ".pkl": freq_per_feature = pd.read_pickle(fname) else: raise ValueError(f"Unknown Fileextension: {fname.suffix}") diff --git a/pimmslearn/io/format.py b/pimmslearn/io/format.py index 8ac18df95..912b8511f 100644 --- a/pimmslearn/io/format.py +++ b/pimmslearn/io/format.py @@ -35,12 +35,12 @@ class bcolors: There are more options available: """ - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKCYAN = '\033[96m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKCYAN = "\033[96m" + OKGREEN = "\033[92m" + WARNING = "\033[93m" + FAIL = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" diff --git a/pimmslearn/io/load.py b/pimmslearn/io/load.py index 7986bd0d8..b29a80070 100644 --- a/pimmslearn/io/load.py +++ b/pimmslearn/io/load.py @@ -6,56 +6,71 @@ logger = logging.getLogger(__name__) -def verify_df(df: pd.DataFrame, - fname: str, - index_col: str, # could be potentially 0 for the first column - verify_fname: bool = False, - usecols=None, - ): +def verify_df( + df: pd.DataFrame, + fname: str, + index_col: str, # could be potentially 0 for the first column + verify_fname: bool = False, + usecols=None, +): if usecols and isinstance(index_col, str): - assert index_col in usecols, 'Add index_col to usecols Sequence' + assert index_col in usecols, "Add index_col to usecols Sequence" if verify_fname: if not len(df.shape) == 2: - raise ValueError(f"Expected 2 -dimensional array, not {len(df.shape)} -dimensional," - f" of type: {type(df)}") + raise ValueError( + f"Expected 2 -dimensional array, not {len(df.shape)} -dimensional," + f" of type: {type(df)}" + ) N, M = df.shape - assert f'N{N:05d}' in str(fname) and f'M{M:05d}' in str(fname), \ - ("Filename number don't match loaded numbers: " - f"{fname} should contain N{N} and M{M}") - - -def from_csv(fname: str, - nrows: int = None, - # could be potentially 0 for the first column - index_col: Union[int, str, List] = 'Sample ID', - verify_fname: bool = False, - usecols=None, - **kwargs): + assert f"N{N:05d}" in str(fname) and f"M{M:05d}" in str(fname), ( + "Filename number don't match loaded numbers: " + f"{fname} should contain N{N} and M{M}" + ) + + +def from_csv( + fname: str, + nrows: int = None, + # could be potentially 0 for the first column + index_col: Union[int, str, List] = "Sample ID", + verify_fname: bool = False, + usecols=None, + **kwargs, +): logger.warning(f"Passed unknown kwargs: {kwargs}") - df = pd.read_csv(fname, index_col=index_col, low_memory=False, - nrows=nrows, usecols=usecols).squeeze('columns') + df = pd.read_csv( + fname, index_col=index_col, low_memory=False, nrows=nrows, usecols=usecols + ).squeeze("columns") if len(df.shape) == 1: # unstack all but first column df = df.unstack(df.index.names[1:]) - verify_df(df=df, fname=fname, - index_col=index_col, - verify_fname=verify_fname, - usecols=usecols) + verify_df( + df=df, + fname=fname, + index_col=index_col, + verify_fname=verify_fname, + usecols=usecols, + ) return df # all __init__ parameters are kwargs -def from_pickle(fname: str, - # could be potentially 0 for the first column - index_col: Union[int, str, List] = 'Sample ID', - verify_fname: bool = False, - usecols=None, - **kwargs) -> pd.DataFrame: +def from_pickle( + fname: str, + # could be potentially 0 for the first column + index_col: Union[int, str, List] = "Sample ID", + verify_fname: bool = False, + usecols=None, + **kwargs, +) -> pd.DataFrame: logger.warning(f"Passed unknown kwargs: {kwargs}") df = pd.read_pickle(fname).squeeze() if len(df.shape) == 1: df = df.unstack(df.index.names[1:]) - verify_df(df=df, fname=fname, - index_col=index_col, - verify_fname=verify_fname, - usecols=usecols) + verify_df( + df=df, + fname=fname, + index_col=index_col, + verify_fname=verify_fname, + usecols=usecols, + ) return df # all __init__ parameters are kwargs diff --git a/pimmslearn/logging.py b/pimmslearn/logging.py index d05f14474..dacd3a39e 100644 --- a/pimmslearn/logging.py +++ b/pimmslearn/logging.py @@ -1,18 +1,23 @@ """Custom logging setup for notebooks.""" + from pathlib import Path from datetime import datetime import logging import sys -LOG_FOLDER = Path('logs') +LOG_FOLDER = Path("logs") LOG_FOLDER.mkdir(exist_ok=True) -def setup_nb_logger(level: int = logging.INFO, - format_str: str = '%(name)s - %(levelname)-8s %(message)s') -> None: +def setup_nb_logger( + level: int = logging.INFO, + format_str: str = "%(name)s - %(levelname)-8s %(message)s", +) -> None: logging.basicConfig(level=level, format=format_str) root_logger = logging.getLogger() - root_logger.setLevel(level) # in case root_logger existed already before calling basicConfig + root_logger.setLevel( + level + ) # in case root_logger existed already before calling basicConfig c_format = logging.Formatter(format_str) if root_logger.handlers: handler = root_logger.handlers[0] @@ -51,7 +56,7 @@ def setup_logger_w_file(logger, level=logging.INFO, fname_base=None): logger.setLevel(level) logger.handlers = [] # remove any handler in case you reexecute the cell - c_format = logging.Formatter('%(name)s - %(levelname)-8s %(message)s') + c_format = logging.Formatter("%(name)s - %(levelname)-8s %(message)s") c_handler = logging.StreamHandler(sys.stdout) c_handler.setLevel(level) @@ -61,7 +66,8 @@ def setup_logger_w_file(logger, level=logging.INFO, fname_base=None): if fname_base: date_log_file = "{:%y%m%d_%H%M}".format(datetime.now()) f_handler = logging.FileHandler( - LOG_FOLDER / f"{fname_base}_{date_log_file}.txt") + LOG_FOLDER / f"{fname_base}_{date_log_file}.txt" + ) f_handler.setLevel(level) f_handler.setFormatter(c_format) logger.addHandler(f_handler) diff --git a/pimmslearn/model.py b/pimmslearn/model.py index 07ff775d7..22f6423c4 100644 --- a/pimmslearn/model.py +++ b/pimmslearn/model.py @@ -16,10 +16,12 @@ def build_df_from_pred_batches(pred, scaler=None, index=None, columns=None): return pred -def get_latent_space(model_method_call: callable, - dl: torch.utils.data.DataLoader, - dl_index: pd.Index, - latent_tuple_pos: int = 0) -> pd.DataFrame: +def get_latent_space( + model_method_call: callable, + dl: torch.utils.data.DataLoader, + dl_index: pd.Index, + latent_tuple_pos: int = 0, +) -> pd.DataFrame: """Create a DataFrame of the latent space based on the model method call to be used (here: the model encoder or a latent space helper method) @@ -51,8 +53,9 @@ def get_latent_space(model_method_call: callable, latent_space.append(res) M = res.shape[-1] - latent_space = build_df_from_pred_batches(latent_space, - index=dl_index, - columns=[f'latent dimension {i+1}' - for i in range(M)]) + latent_space = build_df_from_pred_batches( + latent_space, + index=dl_index, + columns=[f"latent dimension {i+1}" for i in range(M)], + ) return latent_space diff --git a/pimmslearn/models/__init__.py b/pimmslearn/models/__init__.py index be54b97bf..c11ef2228 100644 --- a/pimmslearn/models/__init__.py +++ b/pimmslearn/models/__init__.py @@ -22,18 +22,32 @@ NUMPY_ONE = np.int64(1) -__all__ = ['ae', 'analysis', 'collab', 'vae', 'plot_loss', 'plot_training_losses', - 'calc_net_weight_count', 'RecorderDump', 'split_prediction_by_mask', - 'compare_indices', 'collect_metrics', 'calculte_metrics', - 'Metrics', 'get_df_from_nested_dict'] - - -def plot_loss(recorder: learner.Recorder, - norm_train: np.int64 = NUMPY_ONE, - norm_val: np.int64 = NUMPY_ONE, - skip_start: int = 5, - with_valid: bool = True, - ax: plt.Axes = None) -> plt.Axes: +__all__ = [ + "ae", + "analysis", + "collab", + "vae", + "plot_loss", + "plot_training_losses", + "calc_net_weight_count", + "RecorderDump", + "split_prediction_by_mask", + "compare_indices", + "collect_metrics", + "calculte_metrics", + "Metrics", + "get_df_from_nested_dict", +] + + +def plot_loss( + recorder: learner.Recorder, + norm_train: np.int64 = NUMPY_ONE, + norm_val: np.int64 = NUMPY_ONE, + skip_start: int = 5, + with_valid: bool = True, + ax: plt.Axes = None, +) -> plt.Axes: """Adapted Recorder.plot_loss to accept matplotlib.axes.Axes argument. Allows to build combined graphics. @@ -59,41 +73,53 @@ def plot_loss(recorder: learner.Recorder, """ if not ax: _, ax = plt.subplots() - ax.plot(list(range(skip_start, len(recorder.losses))), - recorder.losses[skip_start:] / norm_train, label='train') + ax.plot( + list(range(skip_start, len(recorder.losses))), + recorder.losses[skip_start:] / norm_train, + label="train", + ) if with_valid: idx = (np.array(recorder.iters) < skip_start).sum() - ax.plot(recorder.iters[idx:], L( - recorder.values[idx:]).itemgot(1) / norm_val, label='valid') + ax.plot( + recorder.iters[idx:], + L(recorder.values[idx:]).itemgot(1) / norm_val, + label="valid", + ) ax.legend() return ax -NORM_ONES = np.array([1, 1], dtype='int') +NORM_ONES = np.array([1, 1], dtype="int") -def plot_training_losses(learner: learner.Learner, - name: str, - ax=None, - norm_factors=NORM_ONES, - folder = None, - figsize=(15, 8)): +def plot_training_losses( + learner: learner.Learner, + name: str, + ax=None, + norm_factors=NORM_ONES, + folder=None, + figsize=(15, 8), +): if ax is None: fig, ax = plt.subplots(figsize=figsize) else: fig = ax.get_figure() - ax.set_title(f'{name} loss') + ax.set_title(f"{name} loss") norm_train, norm_val = norm_factors # exactly two with_valid = True if norm_val is None: with_valid = False - learner.recorder.plot_loss(skip_start=5, ax=ax, with_valid=with_valid, - norm_train=norm_train, norm_val=norm_val) + learner.recorder.plot_loss( + skip_start=5, + ax=ax, + with_valid=with_valid, + norm_train=norm_train, + norm_val=norm_val, + ) if not folder is None: name = name.lower() _ = RecorderDump(learner.recorder, name).save(folder) - pimmslearn.savefig(fig, name=f'{name}_training', - folder=folder) + pimmslearn.savefig(fig, name=f"{name}_training", folder=folder) return fig @@ -107,9 +133,9 @@ def calc_net_weight_count(model: torch.nn.modules.module.Module) -> int: class RecorderDump: - """Simple Class to hold fastai Recorder Callback data for serialization using pickle. - """ - filename_tmp = 'recorder_{}.pkl' + """Simple Class to hold fastai Recorder Callback data for serialization using pickle.""" + + filename_tmp = "recorder_{}.pkl" def __init__(self, recorder, name): self.losses = recorder.losses @@ -117,22 +143,22 @@ def __init__(self, recorder, name): self.iters = recorder.iters self.name = name - def save(self, folder='.'): - with open(Path(folder) / self.filename_tmp.format(self.name), 'wb') as f: + def save(self, folder="."): + with open(Path(folder) / self.filename_tmp.format(self.name), "wb") as f: pickle.dump(self, f) @classmethod def load(cls, filepath, name): - with open(Path(filepath) / cls.filename_tmp.format(name), 'rb') as f: + with open(Path(filepath) / cls.filename_tmp.format(name), "rb") as f: ret = pickle.load(f) return ret plot_loss = plot_loss -def split_prediction_by_mask(pred: pd.DataFrame, - mask: pd.DataFrame, - check_keeps_all: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: +def split_prediction_by_mask( + pred: pd.DataFrame, mask: pd.DataFrame, check_keeps_all: bool = False +) -> Tuple[pd.DataFrame, pd.DataFrame]: """[summary] Parameters @@ -152,8 +178,9 @@ def split_prediction_by_mask(pred: pd.DataFrame, test_pred_observed = pred[~mask].stack() test_pred_real_na = pred[mask].stack() if check_keeps_all: - assert len(test_pred_real_na) + \ - len(test_pred_observed) == reduce(mul, pred.shape) + assert len(test_pred_real_na) + len(test_pred_observed) == reduce( + mul, pred.shape + ) return test_pred_observed, test_pred_real_na @@ -178,15 +205,16 @@ def compare_indices(first_index: pd.Index, second_index: pd.Index) -> pd.Index: """ _diff_index = first_index.difference(second_index) if len(_diff_index): - print("Some predictions couldn't be generated using the approach using artifical replicates.\n" - "These will be omitted for evaluation.") + print( + "Some predictions couldn't be generated using the approach using artifical replicates.\n" + "These will be omitted for evaluation." + ) for _index in _diff_index: print(f"{_index[0]:<40}\t {_index[1]:<40}") return _diff_index -scoring = [('MSE', sklm.mean_squared_error), - ('MAE', sklm.mean_absolute_error)] +scoring = [("MSE", sklm.mean_squared_error), ("MAE", sklm.mean_absolute_error)] def collect_metrics(metrics_jsons: List, key_fct: Callable) -> dict: @@ -227,18 +255,21 @@ def collect_metrics(metrics_jsons: List, key_fct: Callable) -> dict: for k, v in loaded.items(): if k in all_metrics[key]: logger.debug(f"Found existing key: {k = } ") - assert all_metrics[key][k] == v, "Diverging values for {k}: {v1} vs {v2}".format( - k=k, - v1=all_metrics[key][k], - v2=v) + assert ( + all_metrics[key][k] == v + ), "Diverging values for {k}: {v1} vs {v2}".format( + k=k, v1=all_metrics[key][k], v2=v + ) else: all_metrics[key][k] = v return all_metrics -def calculte_metrics(pred_df: pd.DataFrame, - true_col: List[str] = None, - scoring: List[Tuple[str, Callable]] = scoring) -> dict: +def calculte_metrics( + pred_df: pd.DataFrame, + true_col: List[str] = None, + scoring: List[Tuple[str, Callable]] = scoring, +) -> dict: """Create metrics based on predictions, a truth reference and a list of scoring function with a name. @@ -265,7 +296,7 @@ def calculte_metrics(pred_df: pd.DataFrame, if not true_col: # assume first column is truth if None is given y_true = pred_df.iloc[:, 0] - print(f'Selected as truth to compare to: {y_true.name}') + print(f"Selected as truth to compare to: {y_true.name}") y_pred = pred_df.iloc[:, 1:] else: if issubclass(type(true_col), int): @@ -276,10 +307,13 @@ def calculte_metrics(pred_df: pd.DataFrame, y_pred = pred_df.drop(true_col, axis=1) else: raise ValueError( - f'true_col has to be of type str or int, not {type(true_col)}') + f"true_col has to be of type str or int, not {type(true_col)}" + ) if y_true.isna().any(): - raise ValueError(f"Ground truth column '{y_true.name}' contains missing values. " - "Drop these rows first.") + raise ValueError( + f"Ground truth column '{y_true.name}' contains missing values. " + "Drop these rows first." + ) metrics = {} for model_key in y_pred: @@ -288,21 +322,34 @@ def calculte_metrics(pred_df: pd.DataFrame, if len(model_pred) > len(model_pred_no_na): logger.info( f"Drop indices for {model_key}: " - "{}".format([(idx[0], idx[1]) - for idx - in model_pred.index.difference(model_pred_no_na.index)])) + "{}".format( + [ + (idx[0], idx[1]) + for idx in model_pred.index.difference(model_pred_no_na.index) + ] + ) + ) metrics[model_key] = dict( - [(k, float(f(y_true=y_true.loc[model_pred_no_na.index], - y_pred=model_pred_no_na))) - for k, f in scoring] + [ + ( + k, + float( + f( + y_true=y_true.loc[model_pred_no_na.index], + y_pred=model_pred_no_na, + ) + ), + ) + for k, f in scoring + ] ) - metrics[model_key]['N'] = int(len(model_pred_no_na)) - metrics[model_key]['prop'] = len(model_pred_no_na) / len(model_pred) + metrics[model_key]["N"] = int(len(model_pred_no_na)) + metrics[model_key]["prop"] = len(model_pred_no_na) / len(model_pred) return metrics -class Metrics(): +class Metrics: def __init__(self): self.metrics = {} @@ -315,15 +362,14 @@ def __repr__(self): return pprint.pformat(self.metrics, indent=2, compact=True) -def get_df_from_nested_dict(nested_dict, - column_levels=( - 'data_split', 'model', 'metric_name'), - row_name='subset'): +def get_df_from_nested_dict( + nested_dict, column_levels=("data_split", "model", "metric_name"), row_name="subset" +): metrics = {} for k, run_metrics in nested_dict.items(): metrics[k] = pimmslearn.pandas.flatten_dict_of_dicts(run_metrics) - metrics = pd.DataFrame.from_dict(metrics, orient='index') + metrics = pd.DataFrame.from_dict(metrics, orient="index") metrics.columns.names = column_levels metrics.index.name = row_name return metrics diff --git a/pimmslearn/models/ae.py b/pimmslearn/models/ae.py index c1a041591..35cd923a6 100644 --- a/pimmslearn/models/ae.py +++ b/pimmslearn/models/ae.py @@ -3,6 +3,7 @@ Variational Autencoder model adapter should be moved to pimmslearn.models.vae. Or model class could be put somewhere else. """ + import logging from typing import List, Union @@ -25,11 +26,13 @@ logger = logging.getLogger(__name__) -def get_preds_from_df(df: pd.DataFrame, - learn: fastai.learner.Learner, - transformer: pimmslearn.transform.VaepPipeline, - position_pred_tuple: int = None, - dataset: torch.utils.data.Dataset = pimmslearn.io.datasets.DatasetWithTarget): +def get_preds_from_df( + df: pd.DataFrame, + learn: fastai.learner.Learner, + transformer: pimmslearn.transform.VaepPipeline, + position_pred_tuple: int = None, + dataset: torch.utils.data.Dataset = pimmslearn.io.datasets.DatasetWithTarget, +): """Get predictions for specified DataFrame, using a fastai learner and a custom sklearn Pipeline. @@ -52,32 +55,31 @@ def get_preds_from_df(df: pd.DataFrame, tuple tuple of pandas DataFrames (prediciton and target) based on learn.get_preds """ - dl = pimmslearn.io.dataloaders.get_test_dl(df=df, - transformer=transformer, - dataset=dataset) + dl = pimmslearn.io.dataloaders.get_test_dl( + df=df, transformer=transformer, dataset=dataset + ) res = learn.get_preds(dl=dl) # -> dl could be int if position_pred_tuple is not None and issubclass(type(res[0]), tuple): res = (res[0][position_pred_tuple], *res[1:]) - res = L(res).map(lambda x: pd.DataFrame( - x, index=df.index, columns=df.columns)) + res = L(res).map(lambda x: pd.DataFrame(x, index=df.index, columns=df.columns)) res = L(res).map(lambda x: transformer.inverse_transform(x)) return res -leaky_relu_default = nn.LeakyReLU(.1) +leaky_relu_default = nn.LeakyReLU(0.1) class Autoencoder(nn.Module): - """Autoencoder base class. - - """ - - def __init__(self, - n_features: int, - n_neurons: Union[int, List[int]], - activation=leaky_relu_default, - last_decoder_activation=None, - dim_latent: int = 10): + """Autoencoder base class.""" + + def __init__( + self, + n_features: int, + n_neurons: Union[int, List[int]], + activation=leaky_relu_default, + last_decoder_activation=None, + dim_latent: int = 10, + ): """Initialize an Autoencoder Parameters @@ -103,18 +105,19 @@ def __init__(self, # define architecture hidden layer def build_layer(in_feat, out_feat): - return [nn.Linear(in_feat, out_feat), - nn.Dropout(0.2), - nn.BatchNorm1d(out_feat), - activation] + return [ + nn.Linear(in_feat, out_feat), + nn.Dropout(0.2), + nn.BatchNorm1d(out_feat), + activation, + ] # Encoder self.encoder = [] for i in range(len(self.layers) - 1): - in_feat, out_feat = self.layers[i:i + 2] - self.encoder.extend(build_layer(in_feat=in_feat, - out_feat=out_feat)) + in_feat, out_feat = self.layers[i : i + 2] + self.encoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) self.encoder.append(nn.Linear(out_feat, dim_latent)) self.encoder = nn.Sequential(*self.encoder) @@ -124,15 +127,13 @@ def build_layer(in_feat, out_feat): assert self.layers_decoder is not self.layers assert out_feat == self.layers_decoder[0] - self.decoder = build_layer(in_feat=self.dim_latent, - out_feat=out_feat) + self.decoder = build_layer(in_feat=self.dim_latent, out_feat=out_feat) i = -1 # in case a single hidden layer is passed for i in range(len(self.layers_decoder) - 2): - in_feat, out_feat = self.layers_decoder[i:i + 2] - self.decoder.extend(build_layer(in_feat=in_feat, - out_feat=out_feat)) - in_feat, out_feat = self.layers_decoder[i + 1:i + 3] + in_feat, out_feat = self.layers_decoder[i : i + 2] + self.decoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) + in_feat, out_feat = self.layers_decoder[i + 1 : i + 3] self.decoder.append(nn.Linear(in_feat, out_feat)) if last_decoder_activation is not None: @@ -145,10 +146,9 @@ def forward(self, x): return x -def get_missing_values(df_train_wide: pd.DataFrame, - val_idx: pd.Index, - test_idx: pd.Index, - pred: pd.Series) -> pd.Series: +def get_missing_values( + df_train_wide: pd.DataFrame, val_idx: pd.Index, test_idx: pd.Index, pred: pd.Series +) -> pd.Series: """Build missing value predictions based on a set of prediction and splits. Parameters @@ -172,11 +172,9 @@ def get_missing_values(df_train_wide: pd.DataFrame, mask = df_train_wide.isna().stack() idx_real_na = mask.index[mask] # remove fake_na idx - idx_real_na = (idx_real_na - .drop(val_idx) - .drop(test_idx)) + idx_real_na = idx_real_na.drop(val_idx).drop(test_idx) pred_real_na = pred.loc[idx_real_na] - pred_real_na.name = 'intensity' + pred_real_na.name = "intensity" return pred_real_na @@ -193,7 +191,8 @@ def after_pred(self): self.learn.yb = (self.y[self.learn._mask],) except IndexError: logger.warn( - f"Mismatch between mask ({self._mask.shape}) and y ({self.y.shape}).") + f"Mismatch between mask ({self._mask.shape}) and y ({self.y.shape})." + ) # self.learn.y = None self.learn.yb = (self.xb[0],) self.learn.yb = (self.learn.xb[0].clone()[self._mask],) @@ -262,23 +261,22 @@ def after_loss(self): class AutoEncoderAnalysis(analysis.ModelAnalysis): - def __init__(self, - train_df: pd.DataFrame, - val_df: pd.DataFrame, # values to use for validation - model: torch.nn.modules.module.Module, - model_kwargs: dict, - transform: sklearn.pipeline.Pipeline, - decode: List[str], - bs=64 - ): + def __init__( + self, + train_df: pd.DataFrame, + val_df: pd.DataFrame, # values to use for validation + model: torch.nn.modules.module.Module, + model_kwargs: dict, + transform: sklearn.pipeline.Pipeline, + decode: List[str], + bs=64, + ): self.transform = pimmslearn.transform.VaepPipeline( - df_train=train_df, - encode=transform, - decode=decode) + df_train=train_df, encode=transform, decode=decode + ) self.dls = pimmslearn.io.dataloaders.get_dls( - train_X=train_df, - valid_X=val_df, - transformer=self.transform, bs=bs) + train_X=train_df, valid_X=val_df, transformer=self.transform, bs=bs + ) # M = data.train_X.shape[-1] self.kwargs_model = model_kwargs @@ -286,13 +284,17 @@ def __init__(self, self.model = model(**self.kwargs_model) self.n_params_ae = pimmslearn.models.calc_net_weight_count(self.model) - self.params['n_parameters'] = self.n_params_ae + self.params["n_parameters"] = self.n_params_ae self.learn = None def get_preds_from_df(self, df_wide: pd.DataFrame) -> pd.DataFrame: if self.learn is None: raise ValueError("Assign Learner first as learn attribute.") - return get_preds_from_df(df=df_wide, learn=self.learn, transformer=self.transform) + return get_preds_from_df( + df=df_wide, learn=self.learn, transformer=self.transform + ) def get_test_dl(self, df_wide: pd.DataFrame, bs: int = 64) -> pd.DataFrame: - return pimmslearn.io.dataloaders.get_test_dl(df=df_wide, transformer=self.transform, bs=bs) + return pimmslearn.io.dataloaders.get_test_dl( + df=df_wide, transformer=self.transform, bs=bs + ) diff --git a/pimmslearn/models/analysis.py b/pimmslearn/models/analysis.py index 570a54b70..6b1318458 100644 --- a/pimmslearn/models/analysis.py +++ b/pimmslearn/models/analysis.py @@ -9,6 +9,7 @@ class ModelAnalysis(Analysis): """Class describing what an ModelAnalysis is supposed to have as attributes.""" + model: torch.nn.Module dls: fastai.data.core.DataLoaders learn: fastai.learner.Learner diff --git a/pimmslearn/models/collab.py b/pimmslearn/models/collab.py index 812495bfc..7712cdb33 100644 --- a/pimmslearn/models/collab.py +++ b/pimmslearn/models/collab.py @@ -1,12 +1,11 @@ - import logging from typing import Tuple import pandas as pd + # import explicit objects for functional annotations from fastai.collab import * -from fastai.collab import (Categorify, IndexSplitter, TabularCollab, - TransformBlock) +from fastai.collab import Categorify, IndexSplitter, TabularCollab, TransformBlock from fastai.tabular.all import * import pimmslearn.io.dataloaders @@ -16,7 +15,9 @@ logger = logging.getLogger(__name__) -def combine_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[pd.DataFrame, float]: +def combine_data( + train_df: pd.DataFrame, val_df: pd.DataFrame +) -> Tuple[pd.DataFrame, float]: """Helper function to combine training and validation data in long-format. The training and validation data will be mixed up in CF training as the sample embeddings have to be trained for all samples. The returned frac can be used to have @@ -42,16 +43,17 @@ def combine_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[pd.DataF class CollabAnalysis(analysis.ModelAnalysis): - def __init__(self, - datasplits: pimmslearn.io.datasplits.DataSplits, - sample_column: str = 'Sample ID', - item_column: str = 'peptide', - target_column: str = 'intensity', - model_kwargs: dict = None, - batch_size: int = 1_024): + def __init__( + self, + datasplits: pimmslearn.io.datasplits.DataSplits, + sample_column: str = "Sample ID", + item_column: str = "peptide", + target_column: str = "intensity", + model_kwargs: dict = None, + batch_size: int = 1_024, + ): if datasplits.val_y is not None: - self.X, _ = combine_data(datasplits.train_X, - datasplits.val_y) + self.X, _ = combine_data(datasplits.train_X, datasplits.val_y) else: self.X, _ = datasplits.train_X.reset_index(), 0.0 self.batch_size = batch_size @@ -62,7 +64,8 @@ def __init__(self, splits = None if datasplits.val_y is not None: idx_splitter = IndexSplitter( - list(range(len(datasplits.train_X), len(self.X)))) + list(range(len(datasplits.train_X), len(self.X))) + ) splits = idx_splitter(self.X) self.to = TabularCollab( self.X, @@ -70,22 +73,25 @@ def __init__(self, cat_names=cat_names, y_names=[rating_name], y_block=TransformBlock(), - splits=splits) - self.dls = self.to.dataloaders(path='.', bs=self.batch_size) + splits=splits, + ) + self.dls = self.to.dataloaders(path=".", bs=self.batch_size) self.params = {} if model_kwargs is None: model_kwargs = {} self.model_kwargs = model_kwargs - self.params['model_kwargs'] = self.model_kwargs + self.params["model_kwargs"] = self.model_kwargs self.transform = None # No data transformation needed self.learn = None -def get_missing_values(df_train_long: pd.DataFrame, - val_idx: pd.Index, - test_idx: pd.Index, - analysis_collab: CollabAnalysis) -> pd.Series: +def get_missing_values( + df_train_long: pd.DataFrame, + val_idx: pd.Index, + test_idx: pd.Index, + analysis_collab: CollabAnalysis, +) -> pd.Series: """Helper function to get missing values from predictions. Excludes simulated missing values from validation and test data. @@ -107,10 +113,8 @@ def get_missing_values(df_train_long: pd.DataFrame, """ mask = df_train_long.unstack().isna().stack() idx_real_na = mask.loc[mask].index - idx_real_na = (idx_real_na - .drop(val_idx) - .drop(test_idx)) + idx_real_na = idx_real_na.drop(val_idx).drop(test_idx) dl_real_na = analysis_collab.dls.test_dl(idx_real_na.to_frame()) pred_real_na, _ = analysis_collab.learn.get_preds(dl=dl_real_na) - pred_real_na = pd.Series(pred_real_na, idx_real_na, name='intensity') + pred_real_na = pd.Series(pred_real_na, idx_real_na, name="intensity") return pred_real_na diff --git a/pimmslearn/models/collect_dumps.py b/pimmslearn/models/collect_dumps.py index f5e0f9440..d48d11007 100644 --- a/pimmslearn/models/collect_dumps.py +++ b/pimmslearn/models/collect_dumps.py @@ -1,4 +1,5 @@ -""" Collects metrics and config files from the experiment directory structure.""" +"""Collects metrics and config files from the experiment directory structure.""" + from functools import partial, update_wrapper import logging from pathlib import Path @@ -13,20 +14,20 @@ def select_content(s: str, first_split): s = s.split(first_split)[1] assert isinstance(s, str), f"More than one split: {s}" - entries = s.split('_') + entries = s.split("_") if len(entries) > 1: - s = '_'.join(entries[:-1]) + s = "_".join(entries[:-1]) return s -def load_config_file(fname: Path, first_split='config_') -> dict: +def load_config_file(fname: Path, first_split="config_") -> dict: with open(fname) as f: loaded = yaml.safe_load(f) key = f"{fname.parent.name}_{select_content(fname.stem, first_split=first_split)}" return key, loaded -def load_metric_file(fname: Path, first_split='metrics_') -> dict: +def load_metric_file(fname: Path, first_split="metrics_") -> dict: with open(fname) as f: loaded = json.load(f) loaded = pimmslearn.pandas.flatten_dict_of_dicts(loaded) @@ -34,26 +35,29 @@ def load_metric_file(fname: Path, first_split='metrics_') -> dict: return key, loaded -def collect(paths: Iterable, - load_fn: Callable[[Path], dict], - ) -> dict: +def collect( + paths: Iterable, + load_fn: Callable[[Path], dict], +) -> dict: all_metrics = [] for fname in paths: fname = Path(fname) key, loaded = load_fn(fname) logger.debug(f"{key = }") - if 'id' not in loaded: - loaded['id'] = key + if "id" not in loaded: + loaded["id"] = key all_metrics.append(loaded) return all_metrics -collect_metrics = partial(collect, - load_fn=load_metric_file, - ) +collect_metrics = partial( + collect, + load_fn=load_metric_file, +) collect_metrics = update_wrapper(collect_metrics, collect) -collect_configs = partial(collect, - load_fn=load_config_file, - ) +collect_configs = partial( + collect, + load_fn=load_config_file, +) collect_configs = update_wrapper(collect_configs, collect) diff --git a/pimmslearn/models/vae.py b/pimmslearn/models/vae.py index 61240137b..6282c686c 100644 --- a/pimmslearn/models/vae.py +++ b/pimmslearn/models/vae.py @@ -6,26 +6,29 @@ - loss is adapted to Dataset and FastAI adaptions - batchnorm1D for now (not weight norm) """ + from typing import List import torch import torch.nn.functional as F from torch import nn -leaky_relu_default = nn.LeakyReLU(.1) +leaky_relu_default = nn.LeakyReLU(0.1) PI = torch.tensor(torch.pi) -log_of_2 = torch.log(torch.tensor(2.)) +log_of_2 = torch.log(torch.tensor(2.0)) class VAE(nn.Module): - def __init__(self, - n_features: int, - n_neurons: List[int], - activation=leaky_relu_default, - # last_encoder_activation=leaky_relu_default, - last_decoder_activation=None, - dim_latent: int = 10): + def __init__( + self, + n_features: int, + n_neurons: List[int], + activation=leaky_relu_default, + # last_encoder_activation=leaky_relu_default, + last_decoder_activation=None, + dim_latent: int = 10, + ): super().__init__() # set up hyperparameters self.n_features, self.n_neurons = n_features, list(n_neurons) @@ -34,18 +37,19 @@ def __init__(self, # define architecture hidden layer def build_layer(in_feat, out_feat): - return [nn.Linear(in_feat, out_feat), - nn.Dropout(0.2), - nn.BatchNorm1d(out_feat), - activation] + return [ + nn.Linear(in_feat, out_feat), + nn.Dropout(0.2), + nn.BatchNorm1d(out_feat), + activation, + ] # Encoder self.encoder = [] for i in range(len(self.layers) - 1): - in_feat, out_feat = self.layers[i:i + 2] - self.encoder.extend(build_layer(in_feat=in_feat, - out_feat=out_feat)) + in_feat, out_feat = self.layers[i : i + 2] + self.encoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) self.encoder.append(nn.Linear(out_feat, dim_latent * 2)) self.encoder = nn.Sequential(*self.encoder) @@ -55,15 +59,13 @@ def build_layer(in_feat, out_feat): assert self.layers_decoder is not self.layers assert out_feat == self.layers_decoder[0] - self.decoder = build_layer(in_feat=self.dim_latent, - out_feat=out_feat) + self.decoder = build_layer(in_feat=self.dim_latent, out_feat=out_feat) i = -1 # in case a single hidden layer is passed for i in range(len(self.layers_decoder) - 2): - in_feat, out_feat = self.layers_decoder[i:i + 2] - self.decoder.extend(build_layer(in_feat=in_feat, - out_feat=out_feat)) - in_feat, out_feat = self.layers_decoder[i + 1:i + 3] + in_feat, out_feat = self.layers_decoder[i : i + 2] + self.decoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) + in_feat, out_feat = self.layers_decoder[i + 1 : i + 3] self.decoder.append(nn.Linear(in_feat, out_feat * 2)) if last_decoder_activation is not None: @@ -73,8 +75,8 @@ def build_layer(in_feat, out_feat): def encode(self, x): z_params = self.encoder(x) - z_mu = z_params[:, :self.dim_latent] - z_logvar = z_params[:, self.dim_latent:] + z_mu = z_params[:, : self.dim_latent] + z_logvar = z_params[:, self.dim_latent :] return z_mu, z_logvar def get_mu_and_logvar(self, x, detach=False): @@ -82,8 +84,8 @@ def get_mu_and_logvar(self, x, detach=False): def decode(self, z): x_params = self.decoder(z) - x_mu = x_params[:, :self.n_features] - x_logvar = x_params[:, self.n_features:] + x_mu = x_params[:, : self.n_features] + x_logvar = x_params[:, self.n_features :] return x_mu, x_logvar def reparameterize(self, mu, logvar): @@ -102,18 +104,21 @@ def compute_kld(z_mu, z_logvar): def gaussian_log_prob(z, mu, logvar): - return -0.5 * (torch.log(2. * PI) + logvar + (z - mu)**2 / torch.exp(logvar)) + return -0.5 * (torch.log(2.0 * PI) + logvar + (z - mu) ** 2 / torch.exp(logvar)) -def loss_fct(pred, y, reduction='sum', results: List = None, freebits=0.1): +def loss_fct(pred, y, reduction="sum", results: List = None, freebits=0.1): x_mu, x_logvar, z_mu, z_logvar = pred batch = y l_rec = -torch.sum(gaussian_log_prob(batch, x_mu, x_logvar)) - l_reg = torch.sum((F.relu(compute_kld(z_mu, z_logvar) - - freebits * log_of_2) - + freebits * log_of_2), - 1) + l_reg = torch.sum( + ( + F.relu(compute_kld(z_mu, z_logvar) - freebits * log_of_2) + + freebits * log_of_2 + ), + 1, + ) if results is not None: results.append((l_rec.item(), torch.mean(l_reg).item())) diff --git a/pimmslearn/nb.py b/pimmslearn/nb.py index 59576086a..9502d23b1 100644 --- a/pimmslearn/nb.py +++ b/pimmslearn/nb.py @@ -5,10 +5,11 @@ import pimmslearn.io import logging + logger = logging.getLogger() -class Config(): +class Config: """Config class with a setter enforcing that config entries cannot be overwritten. @@ -21,8 +22,7 @@ class Config(): def __setattr__(self, entry, value): """Set if attribute not in instance.""" if hasattr(self, entry) and getattr(self, entry) != value: - raise AttributeError( - f'{entry} already set to {getattr(self, entry)}') + raise AttributeError(f"{entry} already set to {getattr(self, entry)}") super().__setattr__(entry, value) def __repr__(self): @@ -36,12 +36,11 @@ def dump(self, fname=None): if fname is None: try: fname = self.out_folder - fname = Path(fname) / 'model_config.yml' + fname = Path(fname) / "model_config.yml" except AttributeError: - raise AttributeError( - 'Specify fname or set "out_folder" attribute.') + raise AttributeError('Specify fname or set "out_folder" attribute.') d = pimmslearn.io.parse_dict(input_dict=self.__dict__) - with open(fname, 'w') as f: + with open(fname, "w") as f: yaml.dump(d, f) logger.info(f"Dumped config to: {fname}") @@ -70,7 +69,7 @@ def values(self): def get_params(args: dict.keys, globals, remove=True) -> dict: - params = {k: v for k, v in globals.items() if k not in args and k[0] != '_'} + params = {k: v for k, v in globals.items() if k not in args and k[0] != "_"} if not remove: return params remove_keys_from_globals(params.keys(), globals=globals) @@ -86,7 +85,7 @@ def remove_keys_from_globals(keys: dict.keys, globals: dict): logger.warning(f"Key not found in globals(): {k}") -def add_default_paths(cfg: Config, folder_data='', out_root=None): +def add_default_paths(cfg: Config, folder_data="", out_root=None): """Add default paths to config.""" if out_root: cfg.out_folder = Path(out_root) @@ -96,26 +95,29 @@ def add_default_paths(cfg: Config, folder_data='', out_root=None): if folder_data: cfg.data = Path(folder_data) else: - cfg.data = cfg.folder_experiment / 'data' + cfg.data = cfg.folder_experiment / "data" cfg.data.mkdir(exist_ok=True, parents=True) assert cfg.data.exists(), f"Directory not found: {cfg.data}" del folder_data - cfg.out_figures = cfg.folder_experiment / 'figures' + cfg.out_figures = cfg.folder_experiment / "figures" cfg.out_figures.mkdir(exist_ok=True) cfg.out_metrics = cfg.folder_experiment cfg.out_metrics.mkdir(exist_ok=True) cfg.out_models = cfg.folder_experiment cfg.out_models.mkdir(exist_ok=True) - cfg.out_preds = cfg.folder_experiment / 'preds' + cfg.out_preds = cfg.folder_experiment / "preds" cfg.out_preds.mkdir(exist_ok=True) return cfg def args_from_dict(args: dict) -> Config: - assert 'folder_experiment' in args, f'Specify "folder_experiment" in {args}.' - args['folder_experiment'] = Path(args['folder_experiment']) + assert "folder_experiment" in args, f'Specify "folder_experiment" in {args}.' + args["folder_experiment"] = Path(args["folder_experiment"]) args = Config().from_dict(args) args.folder_experiment.mkdir(exist_ok=True, parents=True) - add_default_paths(args, folder_data=args.__dict__.get('folder_data', ''), - out_root=args.__dict__.get('out_root', None)) + add_default_paths( + args, + folder_data=args.__dict__.get("folder_data", ""), + out_root=args.__dict__.get("out_root", None), + ) return args diff --git a/pimmslearn/normalization.py b/pimmslearn/normalization.py index 430d8b118..3a479df9a 100644 --- a/pimmslearn/normalization.py +++ b/pimmslearn/normalization.py @@ -26,10 +26,9 @@ def normalize_by_median(df_wide: pd.DataFrame, axis: int = 1) -> pd.DataFrame: return df_wide -def normalize_sceptre(quant: pd.DataFrame, - iter_thresh: float = 1.1, - iter_max: int = 10, - check_convex=True) -> pd.DataFrame: +def normalize_sceptre( + quant: pd.DataFrame, iter_thresh: float = 1.1, iter_max: int = 10, check_convex=True +) -> pd.DataFrame: """Normalize by sample and channel as in SCeptre paper. Code adapted to work with current pandas versions. @@ -76,7 +75,9 @@ def normalize_sceptre(quant: pd.DataFrame, print(f"Median deviation: {median_dev:.2f}") break if i > 0 and check_convex and max_dev_old < max_dev: - raise ValueError("Non-convex behaviour. old max deviation smaller than current.") + raise ValueError( + "Non-convex behaviour. old max deviation smaller than current." + ) print("performed {} iterations, max-dev: {:.2f}".format(i + 1, max_dev)) max_dev_old = max_dev return quant diff --git a/pimmslearn/pandas/__init__.py b/pimmslearn/pandas/__init__.py index fa69cd7af..8531138ee 100644 --- a/pimmslearn/pandas/__init__.py +++ b/pimmslearn/pandas/__init__.py @@ -7,31 +7,31 @@ import omegaconf import pandas as pd -from pimmslearn.pandas.calc_errors import (calc_errors_per_feat, - get_absolute_error) +from pimmslearn.pandas.calc_errors import calc_errors_per_feat, get_absolute_error __all__ = [ - 'calc_errors_per_feat', - 'get_absolute_error', - 'unique_cols', - 'get_unique_non_unique_columns', - 'prop_unique_index', - 'replace_with', - 'index_to_dict', - 'get_columns_accessor', - 'get_columns_accessor_from_iterable', - 'select_max_by', - 'get_columns_namedtuple', - 'highlight_min', - '_add_indices', - 'interpolate', - 'flatten_dict_of_dicts', - 'key_map', - 'parse_query_expression', - 'length', - 'get_last_index_matching_proportion', - 'get_lower_whiskers', - 'get_counts_per_bin'] + "calc_errors_per_feat", + "get_absolute_error", + "unique_cols", + "get_unique_non_unique_columns", + "prop_unique_index", + "replace_with", + "index_to_dict", + "get_columns_accessor", + "get_columns_accessor_from_iterable", + "select_max_by", + "get_columns_namedtuple", + "highlight_min", + "_add_indices", + "interpolate", + "flatten_dict_of_dicts", + "key_map", + "parse_query_expression", + "length", + "get_last_index_matching_proportion", + "get_lower_whiskers", + "get_counts_per_bin", +] def unique_cols(s: pd.Series) -> bool: @@ -80,15 +80,14 @@ def prop_unique_index(df: pd.DataFrame) -> pd.DataFrame: return 1 - prop -def replace_with(string_key: str, replace: str = "()/", replace_with: str = '') -> str: +def replace_with(string_key: str, replace: str = "()/", replace_with: str = "") -> str: for symbol in replace: string_key = string_key.replace(symbol, replace_with) return string_key def index_to_dict(index: pd.Index) -> dict: - cols = {replace_with(col.replace(' ', '_').replace( - '-', '_')): col for col in index} + cols = {replace_with(col.replace(" ", "_").replace("-", "_")): col for col in index} return cols @@ -101,18 +100,20 @@ def get_columns_accessor(df: pd.DataFrame, all_lower_case=False) -> omegaconf.Om return omegaconf.OmegaConf.create(cols) -def get_columns_accessor_from_iterable(cols: Iterable[str], - all_lower_case=False) -> omegaconf.OmegaConf: +def get_columns_accessor_from_iterable( + cols: Iterable[str], all_lower_case=False +) -> omegaconf.OmegaConf: cols = index_to_dict(cols) if all_lower_case: cols = {k.lower(): v for k, v in cols.items()} return omegaconf.OmegaConf.create(cols) -def select_max_by(df: pd.DataFrame, grouping_columns: list, selection_column: str) -> pd.DataFrame: +def select_max_by( + df: pd.DataFrame, grouping_columns: list, selection_column: str +) -> pd.DataFrame: df = df.sort_values(by=[*grouping_columns, selection_column], ascending=False) - df = df.drop_duplicates(subset=grouping_columns, - keep='first') + df = df.drop_duplicates(subset=grouping_columns, keep="first") return df @@ -131,8 +132,8 @@ def get_columns_namedtuple(df: pd.DataFrame) -> namedtuple: NamedTuple instance with columns as attributes. """ columns = df.columns.to_list() - column_keys = [x.replace(' ', '_') for x in columns] - ColumnsNamedTuple = namedtuple('Columns', column_keys) + column_keys = [x.replace(" ", "_") for x in columns] + ColumnsNamedTuple = namedtuple("Columns", column_keys) return ColumnsNamedTuple(**{k: v for k, v in zip(column_keys, columns)}) @@ -151,11 +152,12 @@ def highlight_min(s: pd.Series) -> list: To be used as `pandas.DataFrame.style.apply(highlight_min)` """ to_highlight = s == s.min() - return ['background-color: yellow' if v else '' for v in to_highlight] + return ["background-color: yellow" if v else "" for v in to_highlight] -def _add_indices(array: np.array, original_df: pd.DataFrame, - index_only: bool = False) -> pd.DataFrame: +def _add_indices( + array: np.array, original_df: pd.DataFrame, index_only: bool = False +) -> pd.DataFrame: index = original_df.index columns = None if not index_only: @@ -163,7 +165,7 @@ def _add_indices(array: np.array, original_df: pd.DataFrame, return pd.DataFrame(array, index=index, columns=columns) -def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: +def interpolate(wide_df: pd.DataFrame, name="interpolated") -> pd.DataFrame: """Interpolate NA values with the values before and after. Uses n=3 replicates. First rows replicates are the two following. @@ -191,8 +193,7 @@ def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: m = last_row.isna() last_row.loc[m] = wide_df.iloc[-3:-1, m.to_list()].mean() - ret = wide_df.interpolate( - method='linear', limit_direction='both', limit=1, axis=0) + ret = wide_df.interpolate(method="linear", limit_direction="both", limit=1, axis=0) ret.iloc[0] = first_row ret.iloc[-1] = last_row @@ -201,7 +202,7 @@ def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: return ret -def flatten_dict_of_dicts(d: dict, parent_key: str = '') -> dict: +def flatten_dict_of_dicts(d: dict, parent_key: str = "") -> dict: """Build tuples for nested dictionaries for use as `pandas.MultiIndex`. Parameters @@ -246,11 +247,10 @@ def key_map(d: dict) -> dict: if isinstance(v, dict): ret[k] = key_map(v) else: - _keys = (_keys) + (k, ) + _keys = (_keys) + (k,) if _keys: if ret: - print( - f"Dictionaries are not of the same length: {_keys = } and {ret = }") + print(f"Dictionaries are not of the same length: {_keys = } and {ret = }") for k in _keys: ret[k] = None else: @@ -258,13 +258,13 @@ def key_map(d: dict) -> dict: return ret -printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ' +printable = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ " def parse_query_expression(s: str, printable: str = printable) -> str: """Parse a query expression for pd.DataFrame.query to a file name. Removes all characters not listed in printable.""" - return ''.join(filter(lambda x: x in printable, s)) + return "".join(filter(lambda x: x in printable, s)) def length(x): @@ -276,9 +276,9 @@ def length(x): return 0 -def get_last_index_matching_proportion(df_counts: pd.DataFrame, - prop: float = 0.25, - prop_col: str = 'proportion') -> object: +def get_last_index_matching_proportion( + df_counts: pd.DataFrame, prop: float = 0.25, prop_col: str = "proportion" +) -> object: """df_counts needs to be sorted by "prop_col" (descending). Parameters @@ -304,21 +304,21 @@ def get_last_index_matching_proportion(df_counts: pd.DataFrame, def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: ret = df.describe() - iqr = ret.loc['75%'] - ret.loc['25%'] - ret = ret.loc['25%'] - iqr * factor + iqr = ret.loc["75%"] - ret.loc["25%"] + ret = ret.loc["25%"] - iqr * factor return ret -def get_counts_per_bin(df: pd.DataFrame, - bins: range, - columns: Optional[List[str]] = None) -> pd.DataFrame: +def get_counts_per_bin( + df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None +) -> pd.DataFrame: """Return counts per bin for selected columns in DataFrame.""" counts_per_bin = dict() if columns is None: columns = df.columns.to_list() for col in columns: - _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size()) - _series.index.name = 'bin' + _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size() + _series.index.name = "bin" counts_per_bin[col] = _series counts_per_bin = pd.DataFrame(counts_per_bin) return counts_per_bin diff --git a/pimmslearn/pandas/calc_errors.py b/pimmslearn/pandas/calc_errors.py index c92de80d7..76b9e13ae 100644 --- a/pimmslearn/pandas/calc_errors.py +++ b/pimmslearn/pandas/calc_errors.py @@ -1,29 +1,30 @@ import pandas as pd -def calc_errors_per_feat(pred: pd.DataFrame, freq_feat: pd.Series, target_col='observed') -> pd.DataFrame: +def calc_errors_per_feat( + pred: pd.DataFrame, freq_feat: pd.Series, target_col="observed" +) -> pd.DataFrame: """Calculate absolute errors and sort by freq of features.""" - n_obs = pred.groupby(freq_feat.index.name)[ - target_col].count().rename('n_obs') + n_obs = pred.groupby(freq_feat.index.name)[target_col].count().rename("n_obs") errors = pred.drop(target_col, axis=1).sub(pred[target_col], axis=0) errors = errors.abs().groupby(freq_feat.index.name).mean() # absolute error errors = errors.join(freq_feat).join(n_obs) errors = errors.sort_values(by=freq_feat.name, ascending=True) - errors.columns.name = 'model' + errors.columns.name = "model" return errors -def calc_errors_per_bin(pred: pd.DataFrame, target_col='observed') -> pd.DataFrame: +def calc_errors_per_bin(pred: pd.DataFrame, target_col="observed") -> pd.DataFrame: """Calculate absolute errors. Bin by integer value of simulated NA and provide count.""" errors = get_absolute_error(pred, y_true=target_col) - errors['bin'] = pred[target_col].astype(int) # integer bin of simulated NA - n_obs = errors.groupby('bin').size().rename('n_obs') - errors = errors.join(n_obs, on='bin') - errors = errors.sort_values(by='n_obs', ascending=True) - errors.columns.name = 'model' + errors["bin"] = pred[target_col].astype(int) # integer bin of simulated NA + n_obs = errors.groupby("bin").size().rename("n_obs") + errors = errors.join(n_obs, on="bin") + errors = errors.sort_values(by="n_obs", ascending=True) + errors.columns.name = "model" return errors -def get_absolute_error(pred: pd.DataFrame, y_true: str = 'observed') -> pd.DataFrame: +def get_absolute_error(pred: pd.DataFrame, y_true: str = "observed") -> pd.DataFrame: errors = pred.drop(y_true, axis=1).sub(pred[y_true], axis=0) return errors.abs() diff --git a/pimmslearn/pandas/missing_data.py b/pimmslearn/pandas/missing_data.py index b4a3b97ec..c07cde7f8 100644 --- a/pimmslearn/pandas/missing_data.py +++ b/pimmslearn/pandas/missing_data.py @@ -1,4 +1,5 @@ """Functionality related to analyzing missing values in a pandas DataFrame.""" + from __future__ import annotations import math @@ -28,7 +29,7 @@ def percent_non_missing(df: pd.DataFrame) -> float: return df.notna().sum().sum() / math.prod(df.shape) -def list_files(folder='.') -> list[str]: +def list_files(folder=".") -> list[str]: return [f.as_posix() for f in Path(folder).iterdir()] @@ -41,17 +42,19 @@ def get_record(data: pd.DataFrame, columns_sample=False) -> dict: N_obs = data.notna().sum().sum() N_mis = N * M - N_obs missing = N_mis / (N_obs + N_mis) - record = dict(N=int(N), - M=int(M), - N_obs=int(N_obs), - N_mis=int(N_mis), - missing=float(missing), ) + record = dict( + N=int(N), + M=int(M), + N_obs=int(N_obs), + N_mis=int(N_mis), + missing=float(missing), + ) return record -def decompose_NAs(data: pd.DataFrame, - level: Union[int, str], - label: int = 'summary') -> pd.DataFrame: +def decompose_NAs( + data: pd.DataFrame, level: Union[int, str], label: int = "summary" +) -> pd.DataFrame: """Decompose missing values by a level into real and indirectly imputed missing values. Real missing value have missing for all samples in a group. Indirectly imputed missing values are in MS-based proteomics data that would be imputed by the mean (or median) of the observed @@ -86,16 +89,22 @@ def decompose_NAs(data: pd.DataFrame, M = len(_df) # normally 2 or 3 _real_mvs = _df.isna().all(axis=0).sum() * M real_mvs += _real_mvs - ii_mvs += (total_NAs - _real_mvs) + ii_mvs += total_NAs - _real_mvs else: ValueError("Something went wrong") assert data.isna().sum().sum() == real_mvs + ii_mvs - return pd.Series( - {'total_obs': data.notna().sum().sum(), - 'total_MVs': data.isna().sum().sum(), - 'real_MVs': real_mvs, - 'indirectly_imputed_MVs': ii_mvs, - 'real_MVs_ratio': real_mvs / data.isna().sum().sum(), - 'indirectly_imputed_MVs_ratio': ii_mvs / data.isna().sum().sum(), - 'total_MVs_ratio': data.isna().sum().sum() / data.size - }).to_frame(name=label).T.convert_dtypes() + return ( + pd.Series( + { + "total_obs": data.notna().sum().sum(), + "total_MVs": data.isna().sum().sum(), + "real_MVs": real_mvs, + "indirectly_imputed_MVs": ii_mvs, + "real_MVs_ratio": real_mvs / data.isna().sum().sum(), + "indirectly_imputed_MVs_ratio": ii_mvs / data.isna().sum().sum(), + "total_MVs_ratio": data.isna().sum().sum() / data.size, + } + ) + .to_frame(name=label) + .T.convert_dtypes() + ) diff --git a/pimmslearn/plotting/__init__.py b/pimmslearn/plotting/__init__.py index 81136dc63..9cf52007d 100644 --- a/pimmslearn/plotting/__init__.py +++ b/pimmslearn/plotting/__init__.py @@ -17,35 +17,39 @@ seaborn.set_style("whitegrid") # seaborn.set_theme() -plt.rcParams['figure.figsize'] = [16.0, 7.0] # [4, 2], [4, 3] -plt.rcParams['pdf.fonttype'] = 42 -plt.rcParams['ps.fonttype'] = 42 +plt.rcParams["figure.figsize"] = [16.0, 7.0] # [4, 2], [4, 3] +plt.rcParams["pdf.fonttype"] = 42 +plt.rcParams["ps.fonttype"] = 42 -plt.rcParams['figure.dpi'] = 147 +plt.rcParams["figure.dpi"] = 147 logger = logging.getLogger(__name__) -__all__ = ['plotly', - 'data', - 'defaults', - 'errors', - 'plot_rolling_error', - # define in this file - 'savefig', - 'select_xticks', - 'select_dates', - 'make_large_descriptors', - 'plot_feat_counts', - 'plot_cutoffs', - ] - - -def _savefig(fig, name, folder: pathlib.Path = '.', - pdf=True, - dpi=300, # default 'figure', - tight_layout=True, - ): +__all__ = [ + "plotly", + "data", + "defaults", + "errors", + "plot_rolling_error", + # define in this file + "savefig", + "select_xticks", + "select_dates", + "make_large_descriptors", + "plot_feat_counts", + "plot_cutoffs", +] + + +def _savefig( + fig, + name, + folder: pathlib.Path = ".", + pdf=True, + dpi=300, # default 'figure', + tight_layout=True, +): """Save matplotlib Figure (having method `savefig`) as pdf and png.""" folder = pathlib.Path(folder) fname = folder / name @@ -53,9 +57,9 @@ def _savefig(fig, name, folder: pathlib.Path = '.', folder.mkdir(exist_ok=True, parents=True) if tight_layout: fig.tight_layout() - fig.savefig(fname.with_suffix('.png'), dpi=dpi) + fig.savefig(fname.with_suffix(".png"), dpi=dpi) if pdf: - fig.savefig(fname.with_suffix('.pdf'), dpi=dpi) + fig.savefig(fname.with_suffix(".pdf"), dpi=dpi) logger.info(f"Saved Figures to {fname}") @@ -109,27 +113,34 @@ def select_dates(date_series: pd.Series, max_ticks=30) -> np.array: xticks -def make_large_descriptors(size='xx-large'): +def make_large_descriptors(size="xx-large"): """Helper function to have very large titles, labes and tick texts for matplotlib plots per default. size: str fontsize or allowed category. Change default if necessary, default 'xx-large' """ - plt.rcParams.update({k: size for k in ['xtick.labelsize', - 'ytick.labelsize', - 'axes.titlesize', - 'axes.labelsize', - 'legend.fontsize', - 'legend.title_fontsize'] - }) + plt.rcParams.update( + { + k: size + for k in [ + "xtick.labelsize", + "ytick.labelsize", + "axes.titlesize", + "axes.labelsize", + "legend.fontsize", + "legend.title_fontsize", + ] + } + ) set_font_sizes = make_large_descriptors -def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int, - format_str: str = '{x:,.3f}') -> matplotlib.axes.Axes: +def add_prop_as_second_yaxis( + ax: matplotlib.axes.Axes, n_samples: int, format_str: str = "{x:,.3f}" +) -> matplotlib.axes.Axes: """Add proportion as second axis. Try to align cleverly Parameters @@ -149,42 +160,41 @@ def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int, logger.info(f"{n_min = }, {n_max = }") lower_prop = n_min / n_samples + (ax.get_ybound()[0] - n_min) / n_samples upper_prop = n_max / n_samples + (ax.get_ybound()[1] - n_max) / n_samples - logger.info(f'{lower_prop = }, {upper_prop = }') + logger.info(f"{lower_prop = }, {upper_prop = }") ax2.set_ybound(lower_prop, upper_prop) # _ = ax2.set_yticks(np.linspace(n_min/n_samples, # n_max /n_samples, len(ax.get_yticks())-2)) _ = ax2.set_yticks(ax.get_yticks()[1:-1] / n_samples) - ax2.yaxis.set_major_formatter( - matplotlib.ticker.StrMethodFormatter(format_str)) + ax2.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str)) return ax2 def add_height_to_barplot(ax, size=5, rotated=False): - ax.annotate = partial(ax.annotate, text='NA', - xytext=(0, int(size / 2)), - ha='center', - size=size, - textcoords='offset points') - ax.annotate = partial(ax.annotate, - rotation=0, - va='center') + ax.annotate = partial( + ax.annotate, + text="NA", + xytext=(0, int(size / 2)), + ha="center", + size=size, + textcoords="offset points", + ) + ax.annotate = partial(ax.annotate, rotation=0, va="center") if rotated: - ax.annotate = partial(ax.annotate, - xytext=(1, int(size / 3)), - rotation=90, - va='bottom') + ax.annotate = partial( + ax.annotate, xytext=(1, int(size / 3)), rotation=90, va="bottom" + ) for bar in ax.patches: if not bar.get_height(): - xy = (bar.get_x() + bar.get_width() / 2, - 0.0) - ax.annotate(text='NA', - xy=xy, - ) + xy = (bar.get_x() + bar.get_width() / 2, 0.0) + ax.annotate( + text="NA", + xy=xy, + ) continue - ax.annotate(text=format(bar.get_height(), '.2f'), - xy=(bar.get_x() + bar.get_width() / 2, - bar.get_height()), - ) + ax.annotate( + text=format(bar.get_height(), ".2f"), + xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), + ) return ax @@ -193,20 +203,22 @@ def add_text_to_barplot(ax, text, size=5): logger.debug(f"{bar = }, f{text = }, {bar.get_height() = }") if not bar.get_height(): continue - ax.annotate(text=text_, - xy=(bar.get_x() + bar.get_width() / 2, - bar.get_height()), - xytext=(1, -5), - rotation=90, - ha='center', - va='top', - size=size, - textcoords='offset points') + ax.annotate( + text=text_, + xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), + xytext=(1, -5), + rotation=90, + ha="center", + va="top", + size=size, + textcoords="offset points", + ) return ax -def format_large_numbers(ax: matplotlib.axes.Axes, - format_str: str = '{x:,.0f}') -> matplotlib.axes.Axes: +def format_large_numbers( + ax: matplotlib.axes.Axes, format_str: str = "{x:,.0f}" +) -> matplotlib.axes.Axes: """Format large integer numbers to be read more easily. Parameters @@ -221,82 +233,93 @@ def format_large_numbers(ax: matplotlib.axes.Axes, matplotlib.axes.Axes _description_ """ - ax.xaxis.set_major_formatter( - matplotlib.ticker.StrMethodFormatter(format_str)) - ax.yaxis.set_major_formatter( - matplotlib.ticker.StrMethodFormatter(format_str)) + ax.xaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str)) + ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str)) return ax -def plot_feat_counts(df_counts: pd.DataFrame, feat_name: str, n_samples: int, - ax=None, figsize=(15, 10), - count_col='counts', - **kwargs): +def plot_feat_counts( + df_counts: pd.DataFrame, + feat_name: str, + n_samples: int, + ax=None, + figsize=(15, 10), + count_col="counts", + **kwargs, +): args = dict( - ylabel='count', - xlabel=f'{feat_name} ordered by completeness', - title=f'Count and proportion of {len(df_counts):,d} {feat_name}s over {n_samples:,d} samples', + ylabel="count", + xlabel=f"{feat_name} ordered by completeness", + title=f"Count and proportion of {len(df_counts):,d} {feat_name}s over {n_samples:,d} samples", ) args.update(kwargs) - ax = df_counts[count_col].plot( - figsize=figsize, - - grid=True, - ax=ax, - **args) + ax = df_counts[count_col].plot(figsize=figsize, grid=True, ax=ax, **args) # default nearly okay, but rather customize to see minimal and maxium proportion # ax = peptide_counts['proportion'].plot(secondary_y=True, style='b') ax2 = add_prop_as_second_yaxis(ax=ax, n_samples=n_samples) - ax2.set_ylabel('proportion') + ax2.set_ylabel("proportion") ax = format_large_numbers(ax=ax) return ax -def plot_counts(df_counts: pd.DataFrame, n_samples, - feat_col_name: str = 'count', - feature_name=None, - ax=None, prop_feat=0.25, min_feat_prop=.01, - **kwargs): +def plot_counts( + df_counts: pd.DataFrame, + n_samples, + feat_col_name: str = "count", + feature_name=None, + ax=None, + prop_feat=0.25, + min_feat_prop=0.01, + **kwargs, +): """Plot counts based on get_df_counts.""" if feature_name is None: feature_name = feat_col_name # df_counts = df_counts[[feat_col_name]].copy() - ax = plot_feat_counts(df_counts, - feat_name=feature_name, - n_samples=n_samples, - count_col=feat_col_name, - ax=ax, **kwargs) - df_counts['prop'] = df_counts[feat_col_name] / n_samples + ax = plot_feat_counts( + df_counts, + feat_name=feature_name, + n_samples=n_samples, + count_col=feat_col_name, + ax=ax, + **kwargs, + ) + df_counts["prop"] = df_counts[feat_col_name] / n_samples n_feat_cutoff = pimmslearn.pandas.get_last_index_matching_proportion( - df_counts=df_counts, prop=prop_feat, prop_col='prop') + df_counts=df_counts, prop=prop_feat, prop_col="prop" + ) n_samples_cutoff = df_counts.loc[n_feat_cutoff, feat_col_name] - logger.info(f'{n_feat_cutoff = }, {n_samples_cutoff = }') + logger.info(f"{n_feat_cutoff = }, {n_samples_cutoff = }") x_lim_max = pimmslearn.pandas.get_last_index_matching_proportion( - df_counts, min_feat_prop, prop_col='prop') - logger.info(f'{x_lim_max = }') + df_counts, min_feat_prop, prop_col="prop" + ) + logger.info(f"{x_lim_max = }") ax.set_xlim(-1, x_lim_max) - ax.axvline(n_feat_cutoff, c='red') + ax.axvline(n_feat_cutoff, c="red") # ax.text(n_feat_cutoff + 0.03 * x_lim_max, # n_samples_cutoff, '25% cutoff', # style='italic', fontsize=12, # bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10}) - ax.annotate(f'{prop_feat*100}% cutoff', - xy=(n_feat_cutoff, n_samples_cutoff), - xytext=(n_feat_cutoff + 0.1 * x_lim_max, n_samples_cutoff), - fontsize=16, - arrowprops=dict(facecolor='black', shrink=0.05)) + ax.annotate( + f"{prop_feat*100}% cutoff", + xy=(n_feat_cutoff, n_samples_cutoff), + xytext=(n_feat_cutoff + 0.1 * x_lim_max, n_samples_cutoff), + fontsize=16, + arrowprops=dict(facecolor="black", shrink=0.05), + ) return ax -def plot_cutoffs(df: pd.DataFrame, - feat_completness_over_samples: int = None, - min_feat_in_sample: int = None - ) -> tuple[matplotlib.figure.Figure, np.array[matplotlib.axes.Axes]]: +def plot_cutoffs( + df: pd.DataFrame, + feat_completness_over_samples: int = None, + min_feat_in_sample: int = None, +) -> tuple[matplotlib.figure.Figure, np.array[matplotlib.axes.Axes]]: """plot number of available features along index and columns (feat vs samples), potentially including some cutoff. @@ -317,13 +340,15 @@ def plot_cutoffs(df: pd.DataFrame, notna = df.notna() fig, axes = plt.subplots(1, 2) ax = axes[0] - notna.sum(axis=0).sort_values().plot(rot=90, ax=ax, - ylabel='count samples', xlabel='feature name') + notna.sum(axis=0).sort_values().plot( + rot=90, ax=ax, ylabel="count samples", xlabel="feature name" + ) if feat_completness_over_samples is not None: ax.axhline(feat_completness_over_samples) ax = axes[1] - notna.sum(axis=1).sort_values().plot(rot=90, ax=ax, - ylabel='count features', xlabel='sample name') + notna.sum(axis=1).sort_values().plot( + rot=90, ax=ax, ylabel="count features", xlabel="sample name" + ) if min_feat_in_sample is not None: ax.axhline(min_feat_in_sample) return fig, axes @@ -340,15 +365,14 @@ def only_every_x_ticks(ax, x=2, axis=None): elif axis == 1: ax.set_yticks(ax.get_yticks()[::x]) else: - raise ValueError(f'axis must be 0 or 1, got {axis}') + raise ValueError(f"axis must be 0 or 1, got {axis}") return ax def use_first_n_chars_in_labels(ax, x=2): """Take first N characters of labels and use them as new labels""" # xaxis - _new_labels = [_l.get_text()[:x] - for _l in ax.get_xticklabels()] + _new_labels = [_l.get_text()[:x] for _l in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels) # yaxis _new_labels = [_l.get_text()[:x] for _l in ax.get_yticklabels()] @@ -356,10 +380,11 @@ def use_first_n_chars_in_labels(ax, x=2): return ax -def split_xticklabels(ax, PG_SEPARATOR=';'): +def split_xticklabels(ax, PG_SEPARATOR=";"): """Split labels by PG_SEPARATOR and only use first part""" if PG_SEPARATOR is not None: - _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] - for _l in ax.get_xticklabels()] + _new_labels = [ + _l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels() + ] _ = ax.set_xticklabels(_new_labels) return ax diff --git a/pimmslearn/plotting/data.py b/pimmslearn/plotting/data.py index 1051a1a43..5a471b3e8 100644 --- a/pimmslearn/plotting/data.py +++ b/pimmslearn/plotting/data.py @@ -1,4 +1,5 @@ """Plot data distribution based on pandas `DataFrames` or `Series`.""" + import logging from typing import Iterable, Optional, Tuple, Union @@ -30,40 +31,33 @@ def min_max(s: pd.Series) -> Tuple[int]: def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]: """Get the min and max as integer from an iterable of pandas.Series.""" - min_bin = int( - min( - (s.min() for s in series)) - ) - max_bin = int( - max( - s.max() for s in series) - ) + min_bin = int(min((s.min() for s in series))) + max_bin = int(max(s.max() for s in series)) return min_bin, max_bin -def plot_histogram_intensities(s: pd.Series, - interval_bins=1, - min_max: Tuple[int] = None, - ax=None, - **kwargs) -> Tuple[Axes, range]: +def plot_histogram_intensities( + s: pd.Series, interval_bins=1, min_max: Tuple[int] = None, ax=None, **kwargs +) -> Tuple[Axes, range]: """Plot intensities in Series in a certain range and equally spaced intervals.""" if min_max is None: min_max = get_min_max_iterable([s]) min_bin, max_bin = min_max bins = range(min_bin, max_bin, interval_bins) - ax = s.plot.hist(bins=bins, xticks=list(bins), - ax=ax, **kwargs) + ax = s.plot.hist(bins=bins, xticks=list(bins), ax=ax, **kwargs) ax.yaxis.set_major_formatter("{x:,.0f}") return ax, bins -def plot_observations(df: pd.DataFrame, - ax: Axes = None, - title: str = '', - axis: int = 1, - size: int = 1, - ylabel: str = 'Frequency', - xlabel: Optional[str] = None) -> Axes: +def plot_observations( + df: pd.DataFrame, + ax: Axes = None, + title: str = "", + axis: int = 1, + size: int = 1, + ylabel: str = "Frequency", + xlabel: Optional[str] = None, +) -> Axes: """Plot non missing observations by row (axis=1) or column (axis=0) in order of number of available observations. No binning is applied, only counts of non-missing values are plotted. @@ -90,31 +84,27 @@ def plot_observations(df: pd.DataFrame, """ if xlabel is None: if df.columns.name: - xlabel = f'Samples ordered by identified {df.columns.name}' + xlabel = f"Samples ordered by identified {df.columns.name}" else: - xlabel = 'Samples ordered by identified features' + xlabel = "Samples ordered by identified features" - ax = (df - .notna() - .sum(axis=axis) - .sort_values() - .reset_index(drop=True) - .plot( - ax=ax, - style='.', - ms=size, - title=title, - ylabel=ylabel, - xlabel=xlabel) - ) - ax.locator_params(axis='y', integer=True) + ax = ( + df.notna() + .sum(axis=axis) + .sort_values() + .reset_index(drop=True) + .plot(ax=ax, style=".", ms=size, title=title, ylabel=ylabel, xlabel=xlabel) + ) + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") return ax -def plot_missing_dist_highdim(data: pd.DataFrame, - min_feat_per_sample: int = None, - min_samples_per_feat: int = None) -> matplotlib.figure.Figure: +def plot_missing_dist_highdim( + data: pd.DataFrame, + min_feat_per_sample: int = None, + min_samples_per_feat: int = None, +) -> matplotlib.figure.Figure: """Plot missing distribution (cdf) in high dimensional data. Parameters @@ -133,137 +123,141 @@ def plot_missing_dist_highdim(data: pd.DataFrame, """ fig, axes = plt.subplots(1, 2, figsize=(4, 2)) not_na = data.notna() - name = 'features per sample' - ax = (not_na - .sum(axis=1) - .to_frame(name) - .groupby(name) - .size() - .sort_index() - .plot - .line(style='.', - ax=axes[0]) - ) - ax.set_ylabel('observations (samples)') + name = "features per sample" + ax = ( + not_na.sum(axis=1) + .to_frame(name) + .groupby(name) + .size() + .sort_index() + .plot.line(style=".", ax=axes[0]) + ) + ax.set_ylabel("observations (samples)") if min_feat_per_sample is not None: - ax.vlines(min_feat_per_sample, *ax.get_ylim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.vlines(min_feat_per_sample, *ax.get_ylim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - name = 'samples per feature' - ax = (not_na - .sum(axis=0) - .to_frame(name) - .groupby(name) - .size() - .sort_index() - .plot - .line(style='.', - ax=axes[1]) - ) + name = "samples per feature" + ax = ( + not_na.sum(axis=0) + .to_frame(name) + .groupby(name) + .size() + .sort_index() + .plot.line(style=".", ax=axes[1]) + ) if min_samples_per_feat is not None: - ax.vlines(min_samples_per_feat, *ax.get_ylim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.vlines(min_samples_per_feat, *ax.get_ylim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - ax.set_ylabel('observations (features)') + ax.set_ylabel("observations (features)") fig.tight_layout() return fig -def plot_missing_dist_boxplots(data: pd.DataFrame, - min_feat_per_sample=None, - min_samples_per_feat=None) -> matplotlib.figure.Figure: +def plot_missing_dist_boxplots( + data: pd.DataFrame, min_feat_per_sample=None, min_samples_per_feat=None +) -> matplotlib.figure.Figure: fig, axes = plt.subplots(1, 2, figsize=(4, 2)) not_na = data.notna() - idx_label, col_label = 'feature', 'sample' + idx_label, col_label = "feature", "sample" if data.index.name: idx_label = data.index.name if data.columns.name: col_label = data.columns.name - ax = (not_na - .sum(axis=1) - .rename(f'observation ({idx_label}) per {col_label}') - .plot - .box(ax=axes[0]) - ) + ax = ( + not_na.sum(axis=1) + .rename(f"observation ({idx_label}) per {col_label}") + .plot.box(ax=axes[0]) + ) if min_feat_per_sample is not None: - ax.hlines(min_feat_per_sample, *ax.get_xlim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.hlines(min_feat_per_sample, *ax.get_xlim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - ax = (not_na - .sum(axis=0) - .rename(f'observation ({idx_label}) per {col_label}') - .plot - .box(ax=axes[1]) - ) + ax = ( + not_na.sum(axis=0) + .rename(f"observation ({idx_label}) per {col_label}") + .plot.box(ax=axes[1]) + ) if min_samples_per_feat is not None: - ax.hlines(min_samples_per_feat, *ax.get_xlim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.hlines(min_samples_per_feat, *ax.get_xlim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") return fig -def plot_missing_pattern_violinplot(data: pd.DataFrame, - min_feat_per_sample=None, - min_samples_per_feat=None) -> matplotlib.figure.Figure: +def plot_missing_pattern_violinplot( + data: pd.DataFrame, min_feat_per_sample=None, min_samples_per_feat=None +) -> matplotlib.figure.Figure: fig, axes = plt.subplots(1, 2, figsize=(4, 2)) not_na = data.notna() - name = 'features per sample' + name = "features per sample" ax = sns.violinplot( data=not_na.sum(axis=1).to_frame(name), ax=axes[0], ) if min_feat_per_sample is not None: - ax.hlines(min_feat_per_sample, *ax.get_xlim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.hlines(min_feat_per_sample, *ax.get_xlim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - ax.set_ylabel('observations (features)') - name = 'samples per feature' + ax.set_ylabel("observations (features)") + name = "samples per feature" ax = sns.violinplot( data=not_na.sum(axis=0).to_frame(name), ax=axes[1], ) if min_samples_per_feat is not None: - ax.hlines(min_samples_per_feat, *ax.get_xlim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.hlines(min_samples_per_feat, *ax.get_xlim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - ax.set_ylabel('observations (samples)') + ax.set_ylabel("observations (samples)") fig.tight_layout() return fig -def plot_missing_pattern_histogram(data: pd.DataFrame, - bins: int = 20, - min_feat_per_sample=None, - min_samples_per_feat=None,) -> matplotlib.figure.Figure: +def plot_missing_pattern_histogram( + data: pd.DataFrame, + bins: int = 20, + min_feat_per_sample=None, + min_samples_per_feat=None, +) -> matplotlib.figure.Figure: fig, axes = plt.subplots(1, 2, figsize=(4, 2)) not_na = data.notna() - idx_label, col_label = 'sample', 'feature' + idx_label, col_label = "sample", "feature" if data.index.name: idx_label = data.index.name if data.columns.name: col_label = data.columns.name - name = f'observations per {idx_label}' - ax = not_na.sum(axis=1).to_frame(name).plot.hist( - ax=axes[0], - bins=bins, - legend=False, + name = f"observations per {idx_label}" + ax = ( + not_na.sum(axis=1) + .to_frame(name) + .plot.hist( + ax=axes[0], + bins=bins, + legend=False, + ) ) if min_feat_per_sample is not None: - ax.vlines(min_feat_per_sample, *ax.get_ylim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.vlines(min_feat_per_sample, *ax.get_ylim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") ax.set_xlabel(name) - ax.set_ylabel('observations in bin') + ax.set_ylabel("observations in bin") # second - name = f'observations per {col_label}' - ax = data = not_na.sum(axis=0).to_frame(name).plot.hist( - ax=axes[1], - bins=bins, - legend=False, + name = f"observations per {col_label}" + ax = data = ( + not_na.sum(axis=0) + .to_frame(name) + .plot.hist( + ax=axes[1], + bins=bins, + legend=False, + ) ) if min_samples_per_feat is not None: - ax.vlines(min_samples_per_feat, *ax.get_ylim(), color='red') - ax.locator_params(axis='y', integer=True) + ax.vlines(min_samples_per_feat, *ax.get_ylim(), color="red") + ax.locator_params(axis="y", integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") ax.set_xlabel(name) ax.set_ylabel(None) @@ -271,53 +265,55 @@ def plot_missing_pattern_histogram(data: pd.DataFrame, return fig -def plot_feat_median_over_prop_missing(data: pd.DataFrame, - type: str = 'scatter', - ax: matplotlib.axes.Axes = None, - s: int = 1, - return_plot_data: bool = False - ) -> Union[matplotlib.axes.Axes, - Tuple[matplotlib.axes.Axes, pd.DataFrame]]: +def plot_feat_median_over_prop_missing( + data: pd.DataFrame, + type: str = "scatter", + ax: matplotlib.axes.Axes = None, + s: int = 1, + return_plot_data: bool = False, +) -> Union[matplotlib.axes.Axes, Tuple[matplotlib.axes.Axes, pd.DataFrame]]: """Plot feature median over proportion missing in that feature. Sorted by feature median into bins.""" - y_col = 'prop. missing' - x_col = 'Features binned by their median intensity (N features)' + y_col = "prop. missing" + x_col = "Features binned by their median intensity (N features)" - missing_by_median = { - 'median feat value': data.median(), - y_col: data.isna().mean()} + missing_by_median = {"median feat value": data.median(), y_col: data.isna().mean()} missing_by_median = pd.DataFrame(missing_by_median) - bins = range( - *min_max(missing_by_median['median feat value']), 1) + bins = range(*min_max(missing_by_median["median feat value"]), 1) - missing_by_median['bins'] = pd.cut( - missing_by_median['median feat value'], bins=bins) - missing_by_median['median feat value (floor)'] = (missing_by_median['median feat value'] - .astype(int) - ) - _counts = (missing_by_median - .groupby('median feat value (floor)')['median feat value'] - .count() - .rename('count')) - missing_by_median = missing_by_median.join( - _counts, on='median feat value (floor)') - missing_by_median = missing_by_median.sort_values( - 'median feat value (floor)') - missing_by_median[x_col] = (missing_by_median.iloc[:, -2:] - .apply(lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1) - ) - if type == 'scatter': - ax = missing_by_median.plot.scatter(x_col, y_col, - ylim=(-.03, 1.03), - ax=ax, - s=s,) + missing_by_median["bins"] = pd.cut( + missing_by_median["median feat value"], bins=bins + ) + missing_by_median["median feat value (floor)"] = missing_by_median[ + "median feat value" + ].astype(int) + _counts = ( + missing_by_median.groupby("median feat value (floor)")["median feat value"] + .count() + .rename("count") + ) + missing_by_median = missing_by_median.join(_counts, on="median feat value (floor)") + missing_by_median = missing_by_median.sort_values("median feat value (floor)") + missing_by_median[x_col] = missing_by_median.iloc[:, -2:].apply( + lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1 + ) + if type == "scatter": + ax = missing_by_median.plot.scatter( + x_col, + y_col, + ylim=(-0.03, 1.03), + ax=ax, + s=s, + ) # # for some reason this does not work as it does elswhere: # _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45) # # do it manually: - _ = [(_l.set_rotation(45), _l.set_horizontalalignment('right')) - for _l in ax.get_xticklabels()] - elif type == 'boxplot': + _ = [ + (_l.set_rotation(45), _l.set_horizontalalignment("right")) + for _l in ax.get_xticklabels() + ] + elif type == "boxplot": ax = missing_by_median[[x_col, y_col]].plot.box( by=x_col, boxprops=dict(linewidth=s), @@ -325,15 +321,15 @@ def plot_feat_median_over_prop_missing(data: pd.DataFrame, ax=ax, ) ax = ax[0] # returned series due to by argument? - _ = ax.set_title('') + _ = ax.set_title("") _ = ax.set_ylabel(y_col) - _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, - horizontalalignment='right') + _ = ax.set_xticklabels( + ax.get_xticklabels(), rotation=45, horizontalalignment="right" + ) _ = ax.set_xlabel(x_col) _ = ax.set_ylim(-0.03, 1.03) else: - raise ValueError( - f'Unknown plot type: {type}, choose from: scatter, boxplot') + raise ValueError(f"Unknown plot type: {type}, choose from: scatter, boxplot") if return_plot_data: return ax, missing_by_median return ax diff --git a/pimmslearn/plotting/defaults.py b/pimmslearn/plotting/defaults.py index d168ae627..4228cca46 100644 --- a/pimmslearn/plotting/defaults.py +++ b/pimmslearn/plotting/defaults.py @@ -9,31 +9,30 @@ # https://seaborn.pydata.org/tutorial/color_palettes.html # sns.color_palette("husl", N) to get N distinct colors color_model_mapping = { - 'KNN': sns.color_palette()[0], - 'KNN_IMPUTE': sns.color_palette()[1], - 'CF': sns.color_palette()[2], - 'DAE': sns.color_palette()[3], - 'VAE': sns.color_palette()[4], - 'RF': sns.color_palette()[5], - 'Median': sns.color_palette()[6], - 'None': sns.color_palette()[7], - 'BPCA': sns.color_palette()[8], - 'MICE-CART': sns.color_palette()[9], - + "KNN": sns.color_palette()[0], + "KNN_IMPUTE": sns.color_palette()[1], + "CF": sns.color_palette()[2], + "DAE": sns.color_palette()[3], + "VAE": sns.color_palette()[4], + "RF": sns.color_palette()[5], + "Median": sns.color_palette()[6], + "None": sns.color_palette()[7], + "BPCA": sns.color_palette()[8], + "MICE-CART": sns.color_palette()[9], } # other_colors = sns.color_palette()[8:] other_colors = sns.color_palette("husl", 20) -color_model_mapping['IMPSEQ'] = other_colors[0] -color_model_mapping['QRILC'] = other_colors[1] -color_model_mapping['IMPSEQROB'] = other_colors[1] -color_model_mapping['MICE-NORM'] = other_colors[2] -color_model_mapping['SEQKNN'] = other_colors[3] -color_model_mapping['IMPSEQROB'] = other_colors[4] -color_model_mapping['GSIMP'] = other_colors[5] -color_model_mapping['MSIMPUTE'] = other_colors[6] -color_model_mapping['MSIMPUTE_MNAR'] = other_colors[7] -color_model_mapping['TRKNN'] = other_colors[8] -color_model_mapping['SVDMETHOD'] = other_colors[9] +color_model_mapping["IMPSEQ"] = other_colors[0] +color_model_mapping["QRILC"] = other_colors[1] +color_model_mapping["IMPSEQROB"] = other_colors[1] +color_model_mapping["MICE-NORM"] = other_colors[2] +color_model_mapping["SEQKNN"] = other_colors[3] +color_model_mapping["IMPSEQROB"] = other_colors[4] +color_model_mapping["GSIMP"] = other_colors[5] +color_model_mapping["MSIMPUTE"] = other_colors[6] +color_model_mapping["MSIMPUTE_MNAR"] = other_colors[7] +color_model_mapping["TRKNN"] = other_colors[8] +color_model_mapping["SVDMETHOD"] = other_colors[9] other_colors = other_colors[10:] @@ -74,17 +73,17 @@ def _repr_html_(self): 'stroke-width:2;stroke:rgb(255,255,255)" metadata="tt"/>' ) html += f'{m}' - html += '' + html += "" return html -labels_dict = {"NA not interpolated valid_collab collab MSE": 'MSE', - 'batch_size': 'bs', - 'n_hidden_layers': "No. of hidden layers", - 'latent_dim': 'hidden layer dimension', - 'subset_w_N': 'subset', - 'n_params': 'no. of parameter', - "metric_value": 'value', - 'metric_name': 'metric', - } - +labels_dict = { + "NA not interpolated valid_collab collab MSE": "MSE", + "batch_size": "bs", + "n_hidden_layers": "No. of hidden layers", + "latent_dim": "hidden layer dimension", + "subset_w_N": "subset", + "n_params": "no. of parameter", + "metric_value": "value", + "metric_name": "metric", +} diff --git a/pimmslearn/plotting/errors.py b/pimmslearn/plotting/errors.py index 86ddedf39..84905f747 100644 --- a/pimmslearn/plotting/errors.py +++ b/pimmslearn/plotting/errors.py @@ -1,4 +1,5 @@ """Plot errors based on DataFrame with model predictions.""" + from __future__ import annotations from typing import Optional @@ -12,68 +13,76 @@ import pimmslearn.pandas.calc_errors -def plot_errors_binned(pred: pd.DataFrame, target_col='observed', - ax: Axes = None, - palette: dict = None, - metric_name: Optional[str] = None, - errwidth: float = 1.2) -> Axes: - assert target_col in pred.columns, f'Specify `target_col` parameter, `pred` do no contain: {target_col}' +def plot_errors_binned( + pred: pd.DataFrame, + target_col="observed", + ax: Axes = None, + palette: dict = None, + metric_name: Optional[str] = None, + errwidth: float = 1.2, +) -> Axes: + assert ( + target_col in pred.columns + ), f"Specify `target_col` parameter, `pred` do no contain: {target_col}" models_order = pred.columns.to_list() models_order.remove(target_col) errors_binned = pimmslearn.pandas.calc_errors.calc_errors_per_bin( - pred=pred, target_col=target_col) + pred=pred, target_col=target_col + ) - meta_cols = ['bin', 'n_obs'] # calculated along binned error - len_max_bin = len(str(int(errors_binned['bin'].max()))) - n_obs = (errors_binned[meta_cols] - .apply( - lambda x: f"{x.bin:0{len_max_bin}}\n(N={x.n_obs:,d})", axis=1 + meta_cols = ["bin", "n_obs"] # calculated along binned error + len_max_bin = len(str(int(errors_binned["bin"].max()))) + n_obs = ( + errors_binned[meta_cols] + .apply(lambda x: f"{x.bin:0{len_max_bin}}\n(N={x.n_obs:,d})", axis=1) + .rename("intensity bin") + .astype("category") ) - .rename('intensity bin') - .astype('category') + metric_name = metric_name or "Average error" + + errors_binned = ( + errors_binned[models_order] + .stack() + .to_frame(metric_name) + .join(n_obs) + .reset_index() + ) + + ax = sns.barplot( + data=errors_binned, + ax=ax, + x="intensity bin", + y=metric_name, + hue="model", + palette=palette, + errwidth=errwidth, ) - metric_name = metric_name or 'Average error' - - errors_binned = (errors_binned - [models_order] - .stack() - .to_frame(metric_name) - .join(n_obs) - .reset_index() - ) - - ax = sns.barplot(data=errors_binned, ax=ax, - x='intensity bin', y=metric_name, hue='model', - palette=palette, - errwidth=errwidth,) ax.xaxis.set_tick_params(rotation=90) return ax, errors_binned -def plot_errors_by_median(pred: pd.DataFrame, - feat_medians: pd.Series, - target_col='observed', - ax: Axes = None, - palette: dict = None, - feat_name: str = None, - metric_name: Optional[str] = None, - errwidth: float = 1.2) -> tuple[Axes, pd.DataFrame]: +def plot_errors_by_median( + pred: pd.DataFrame, + feat_medians: pd.Series, + target_col="observed", + ax: Axes = None, + palette: dict = None, + feat_name: str = None, + metric_name: Optional[str] = None, + errwidth: float = 1.2, +) -> tuple[Axes, pd.DataFrame]: # calculate absolute errors errors = pimmslearn.pandas.get_absolute_error(pred, y_true=target_col) - errors.columns.name = 'model' + errors.columns.name = "model" # define bins by integer value of median feature intensity feat_medians = feat_medians.astype(int).rename("bin") # number of intensities per bin n_obs = pred[target_col].to_frame().join(feat_medians) - n_obs = n_obs.groupby('bin').size().to_frame('n_obs') + n_obs = n_obs.groupby("bin").size().to_frame("n_obs") - errors = (errors - .stack() - .to_frame(metric_name) - .join(feat_medians) - ).reset_index() + errors = (errors.stack().to_frame(metric_name).join(feat_medians)).reset_index() n_obs.index.name = "bin" errors = errors.join(n_obs, on="bin") @@ -81,37 +90,39 @@ def plot_errors_by_median(pred: pd.DataFrame, if feat_name is None: feat_name = feat_medians.index.name if not feat_name: - feat_name = 'feature' + feat_name = "feature" - x_axis_name = f'intensity binned by median of {feat_name}' - len_max_bin = len(str(int(errors['bin'].max()))) + x_axis_name = f"intensity binned by median of {feat_name}" + len_max_bin = len(str(int(errors["bin"].max()))) errors[x_axis_name] = ( - errors[['bin', 'n_obs']] - .apply( - lambda x: f"{x.bin:0{len_max_bin}}\n(N={x.n_obs:,d})", axis=1 - ) - .rename('intensity bin') - .astype('category') + errors[["bin", "n_obs"]] + .apply(lambda x: f"{x.bin:0{len_max_bin}}\n(N={x.n_obs:,d})", axis=1) + .rename("intensity bin") + .astype("category") ) - metric_name = metric_name or 'Average error' + metric_name = metric_name or "Average error" - sns.barplot(data=errors, - ax=ax, - x=x_axis_name, - y=metric_name, - hue='model', - palette=palette, - errwidth=errwidth,) + sns.barplot( + data=errors, + ax=ax, + x=x_axis_name, + y=metric_name, + hue="model", + palette=palette, + errwidth=errwidth, + ) ax.xaxis.set_tick_params(rotation=90) return ax, errors -def get_data_for_errors_by_median(errors: pd.DataFrame, - feat_name: str, - metric_name: str, - model_column: str = 'model', - seed: int = 42) -> pd.DataFrame: +def get_data_for_errors_by_median( + errors: pd.DataFrame, + feat_name: str, + metric_name: str, + model_column: str = "model", + seed: int = 42, +) -> pd.DataFrame: """Extract Bars with confidence intervals from seaborn plot for seaborn 0.13 and above. Confident intervals are calculated with bootstrapping(sampling the mean). @@ -126,33 +137,48 @@ def get_data_for_errors_by_median(errors: pd.DataFrame, model_column: str model_column in errors, defining model names """ - x_axis_name = f'intensity binned by median of {feat_name}' + x_axis_name = f"intensity binned by median of {feat_name}" aggregator = EstimateAggregator("mean", ("ci", 95), n_boot=1_000, seed=seed) # ! need to iterate over all models myself using groupby - ret = (errors - .groupby(by=[x_axis_name, model_column,], observed=True) - [[x_axis_name, model_column, metric_name]] - .apply(lambda df: aggregator(df, metric_name)) - .reset_index()) + ret = ( + errors.groupby( + by=[ + x_axis_name, + model_column, + ], + observed=True, + )[[x_axis_name, model_column, metric_name]] + .apply(lambda df: aggregator(df, metric_name)) + .reset_index() + ) ret.columns = ["bin", model_column, "mean", "ci_low", "ci_high"] return ret - -def plot_rolling_error(errors: pd.DataFrame, metric_name: str, window: int = 200, - min_freq=None, freq_col: str = 'freq', colors_to_use=None, - ax=None): - errors_smoothed = errors.drop(freq_col, axis=1).rolling( - window=window, min_periods=1).mean() +def plot_rolling_error( + errors: pd.DataFrame, + metric_name: str, + window: int = 200, + min_freq=None, + freq_col: str = "freq", + colors_to_use=None, + ax=None, +): + errors_smoothed = ( + errors.drop(freq_col, axis=1).rolling(window=window, min_periods=1).mean() + ) errors_smoothed_max = errors_smoothed.max().max() errors_smoothed[freq_col] = errors[freq_col] if min_freq is None: min_freq = errors_smoothed[freq_col].min() else: errors_smoothed = errors_smoothed.loc[errors_smoothed[freq_col] > min_freq] - ax = errors_smoothed.plot(x=freq_col, ylabel=f'rolling average error ({metric_name})', - color=colors_to_use, - xlim=(min_freq, errors_smoothed[freq_col].max()), - ylim=(0, min(errors_smoothed_max, 5)), - ax=None) + ax = errors_smoothed.plot( + x=freq_col, + ylabel=f"rolling average error ({metric_name})", + color=colors_to_use, + xlim=(min_freq, errors_smoothed[freq_col].max()), + ylim=(0, min(errors_smoothed_max, 5)), + ax=None, + ) return ax diff --git a/pimmslearn/plotting/plotly.py b/pimmslearn/plotting/plotly.py index 8cac92b78..4cc216507 100644 --- a/pimmslearn/plotting/plotly.py +++ b/pimmslearn/plotting/plotly.py @@ -1,8 +1,7 @@ - - def apply_default_layout(fig): fig.update_layout( - font={'size': 18}, - xaxis={'title': {'standoff': 15}}, - yaxis={'title': {'standoff': 15}}) + font={"size": 18}, + xaxis={"title": {"standoff": 15}}, + yaxis={"title": {"standoff": 15}}, + ) return fig diff --git a/pimmslearn/sampling.py b/pimmslearn/sampling.py index 52dc1f8c8..cfd7318b1 100644 --- a/pimmslearn/sampling.py +++ b/pimmslearn/sampling.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -def feature_frequency(df_wide: pd.DataFrame, measure_name: str = 'freq') -> pd.Series: +def feature_frequency(df_wide: pd.DataFrame, measure_name: str = "freq") -> pd.Series: """Generate frequency table based on singly indexed (both axes) DataFrame. Parameters @@ -36,7 +36,9 @@ def feature_frequency(df_wide: pd.DataFrame, measure_name: str = 'freq') -> pd.S return freq_per_feat.squeeze() -def frequency_by_index(df_long: pd.DataFrame, sample_index_to_drop: Union[str, int]) -> pd.Series: +def frequency_by_index( + df_long: pd.DataFrame, sample_index_to_drop: Union[str, int] +) -> pd.Series: """Generate frequency table based on an index level of a 2D multiindex. Parameters @@ -59,9 +61,13 @@ def frequency_by_index(df_long: pd.DataFrame, sample_index_to_drop: Union[str, i return freq_per_feat.squeeze() -def sample_data(series: pd.Series, sample_index_to_drop: Union[str, int], - frac=0.95, weights: pd.Series = None, - random_state=42) -> Tuple[pd.Series, pd.Series]: +def sample_data( + series: pd.Series, + sample_index_to_drop: Union[str, int], + frac=0.95, + weights: pd.Series = None, + random_state=42, +) -> Tuple[pd.Series, pd.Series]: """sample from doubly indexed series with sample index and feature index. Parameters @@ -86,10 +92,11 @@ def sample_data(series: pd.Series, sample_index_to_drop: Union[str, int], """ index_names = series.index.names new_column = index_names[sample_index_to_drop] - df = series.to_frame('intensity').reset_index(sample_index_to_drop) + df = series.to_frame("intensity").reset_index(sample_index_to_drop) df_sampled = df.groupby(by=new_column).sample( - frac=frac, weights=weights, random_state=random_state) + frac=frac, weights=weights, random_state=random_state + ) series_sampled = df_sampled.reset_index().set_index(index_names).squeeze() idx_diff = series.index.difference(series_sampled.index) @@ -97,11 +104,12 @@ def sample_data(series: pd.Series, sample_index_to_drop: Union[str, int], return series_sampled, series_not_sampled -def sample_mnar_mcar(df_long: pd.DataFrame, - frac_non_train: float, - frac_mnar: float, - random_state: int = 42 - ) -> Tuple[DataSplits, pd.Series, pd.Series, pd.Series]: +def sample_mnar_mcar( + df_long: pd.DataFrame, + frac_non_train: float, + frac_mnar: float, + random_state: int = 42, +) -> Tuple[DataSplits, pd.Series, pd.Series, pd.Series]: """Sampling of data for MNAR/MCAR simulation. The function samples from the df_long DataFrame and returns the training, validation and test splits in dhte DataSplits object. @@ -148,32 +156,22 @@ def sample_mnar_mcar(df_long: pd.DataFrame, N_MNAR = int(frac_non_train * frac_mnar * N) fake_na_mnar = df_long.loc[mask] if len(fake_na_mnar) > N_MNAR: - fake_na_mnar = fake_na_mnar.sample(N_MNAR, - random_state=random_state) + fake_na_mnar = fake_na_mnar.sample(N_MNAR, random_state=random_state) # select MCAR from remaining intensities splits = DataSplits(is_wide_format=False) - splits.train_X = df_long.loc[ - df_long.index.difference( - fake_na_mnar.index) - ] + splits.train_X = df_long.loc[df_long.index.difference(fake_na_mnar.index)] logger.info(f"{len(fake_na_mnar) = :,d}") N_MCAR = int(N * (1 - frac_mnar) * frac_non_train) - fake_na_mcar = splits.train_X.sample(N_MCAR, - random_state=random_state) + fake_na_mcar = splits.train_X.sample(N_MCAR, random_state=random_state) logger.info(f"{len(splits.train_X) = :,d}") fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze() logger.info(f"{len(fake_na) = :,d}") logger.info(f"{len(fake_na_mcar) = :,d}") - splits.train_X = (splits - .train_X - .loc[splits - .train_X - .index - .difference( - fake_na_mcar.index)] - ).squeeze() + splits.train_X = ( + splits.train_X.loc[splits.train_X.index.difference(fake_na_mcar.index)] + ).squeeze() # Distribute MNAR and MCAR in validation and test set splits.val_y = fake_na.sample(frac=0.5, random_state=random_state) splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)] @@ -182,8 +180,9 @@ def sample_mnar_mcar(df_long: pd.DataFrame, return splits, thresholds, fake_na_mcar, fake_na_mnar -def get_thresholds(df_long: pd.DataFrame, frac_non_train: float, - random_state: int) -> pd.Series: +def get_thresholds( + df_long: pd.DataFrame, frac_non_train: float, random_state: int +) -> pd.Series: """Get thresholds for MNAR/MCAR sampling. Thresholds are sampled from a normal distrubiton with a mean of the quantile of the simulated missing data. @@ -218,30 +217,22 @@ def get_thresholds(df_long: pd.DataFrame, frac_non_train: float, def check_split_integrity(splits: DataSplits) -> DataSplits: """Check if IDs in are only in validation or test data for rare cases. Returns the corrected splits.""" - diff = (splits - .val_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) + diff = ( + splits.val_y.index.levels[-1] + .difference(splits.train_X.index.levels[-1]) + .to_list() + ) if diff: logger.warning(f"Remove from val: {diff.to_list()}") to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]] splits.train_X = pd.concat([splits.train_X, to_remove]) splits.val_y = splits.val_y.drop(to_remove.index) - diff = (splits - .test_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) + diff = ( + splits.test_y.index.levels[-1] + .difference(splits.train_X.index.levels[-1]) + .to_list() + ) if diff: logger.warning(f"Remove from test: {diff.to_list()}") to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]] diff --git a/pimmslearn/sklearn/__init__.py b/pimmslearn/sklearn/__init__.py index df1bac5e0..492e1ce05 100644 --- a/pimmslearn/sklearn/__init__.py +++ b/pimmslearn/sklearn/__init__.py @@ -2,6 +2,7 @@ Might be moved to a separate package in the future. """ + import logging from njab.sklearn import run_pca diff --git a/pimmslearn/sklearn/ae_transformer.py b/pimmslearn/sklearn/ae_transformer.py index 1fa1c4124..e0fd8f2d0 100644 --- a/pimmslearn/sklearn/ae_transformer.py +++ b/pimmslearn/sklearn/ae_transformer.py @@ -1,4 +1,5 @@ """Scikit-learn style interface for Denoising and Variational Autoencoder model.""" + from __future__ import annotations from pathlib import Path @@ -19,6 +20,7 @@ from sklearn.utils.validation import check_is_fitted import pimmslearn.models as models + # patch plotting function from pimmslearn.models import ae, plot_loss @@ -26,10 +28,8 @@ default_pipeline = sklearn.pipeline.Pipeline( - [ - ('normalize', StandardScaler()), - ('impute', SimpleImputer(add_indicator=False)) - ]) + [("normalize", StandardScaler()), ("impute", SimpleImputer(add_indicator=False))] +) class AETransformer(TransformerMixin, BaseEstimator): @@ -58,38 +58,41 @@ class AETransformer(TransformerMixin, BaseEstimator): """ - def __init__(self, - hidden_layers: list[int], - latent_dim: int = 15, - out_folder: str = '.', - model='VAE', - # y_range:Optional[tuple[int]]=None, - batch_size: int = 64, - ): + def __init__( + self, + hidden_layers: list[int], + latent_dim: int = 15, + out_folder: str = ".", + model="VAE", + # y_range:Optional[tuple[int]]=None, + batch_size: int = 64, + ): self.hidden_layers = hidden_layers self.latent_dim = latent_dim self.batch_size = batch_size self.out_folder = Path(out_folder) self.out_folder.mkdir(exist_ok=True, parents=True) - if model == 'VAE': + if model == "VAE": self.model = models.vae.VAE self.cbs = [ae.ModelAdapterVAE()] self.loss_fct = models.vae.loss_fct - elif model == 'DAE': + elif model == "DAE": self.model = ae.Autoencoder self.cbs = [ae.ModelAdapter(p=0.2)] - self.loss_fct = MSELossFlat(reduction='sum') + self.loss_fct = MSELossFlat(reduction="sum") else: raise ValueError(f'Unknown model {model}, choose either "VAE" or "DAE"') self.model_name = model - def fit(self, - X: pd.DataFrame, - y: pd.DataFrame = None, - epochs_max: int = 100, - cuda: bool = True, - patience: Optional[int] = None): + def fit( + self, + X: pd.DataFrame, + y: pd.DataFrame = None, + epochs_max: int = 100, + cuda: bool = True, + patience: Optional[int] = None, + ): """Fit the model to the data. Parameters @@ -115,13 +118,16 @@ def fit(self, train_df=X, val_df=y, model=self.model, - model_kwargs=dict(n_features=X.shape[-1], - n_neurons=self.hidden_layers, - last_decoder_activation=None, - dim_latent=self.latent_dim), + model_kwargs=dict( + n_features=X.shape[-1], + n_neurons=self.hidden_layers, + last_decoder_activation=None, + dim_latent=self.latent_dim, + ), transform=default_pipeline, - decode=['normalize'], - bs=self.batch_size) + decode=["normalize"], + bs=self.batch_size, + ) self.n_params = self.analysis.n_params_ae if cuda: @@ -130,14 +136,15 @@ def fit(self, cbs = self.cbs if patience is not None: cbs = [*self.cbs, EarlyStoppingCallback(patience=patience)] - self.analysis.learn = Learner(dls=self.analysis.dls, - model=self.analysis.model, - loss_func=self.loss_fct, - cbs=cbs - ) + self.analysis.learn = Learner( + dls=self.analysis.dls, + model=self.analysis.model, + loss_func=self.loss_fct, + cbs=cbs, + ) suggested_lr = self.analysis.learn.lr_find() - self.analysis.params['suggested_inital_lr'] = suggested_lr.valley + self.analysis.params["suggested_inital_lr"] = suggested_lr.valley self.analysis.learn.fit_one_cycle(epochs_max, lr_max=suggested_lr.valley) self.epochs_trained_ = self.analysis.learn.epoch + 1 N_train_notna = X.notna().sum().sum() @@ -145,9 +152,11 @@ def fit(self, if y is not None: N_val_notna = y.notna().sum().sum() self.fig_loss_ = models.plot_training_losses( - self.analysis.learn, self.model_name, + self.analysis.learn, + self.model_name, folder=self.out_folder, - norm_factors=[N_train_notna, N_val_notna]) + norm_factors=[N_train_notna, N_val_notna], + ) return self def transform(self, X): @@ -165,7 +174,7 @@ def transform(self, X): Return the imputed DataFrame using the model. """ # Check is fit had been called - check_is_fitted(self, 'epochs_trained_') + check_is_fitted(self, "epochs_trained_") self.analysis.model.eval() @@ -173,5 +182,6 @@ def transform(self, X): df=X, learn=self.analysis.learn, position_pred_tuple=0, - transformer=self.analysis.transform) + transformer=self.analysis.transform, + ) return X.fillna(pred) diff --git a/pimmslearn/sklearn/cf_transformer.py b/pimmslearn/sklearn/cf_transformer.py index be1bc184a..1b9ab5810 100644 --- a/pimmslearn/sklearn/cf_transformer.py +++ b/pimmslearn/sklearn/cf_transformer.py @@ -1,4 +1,5 @@ """Scikit-learn style interface for Collaborative Filtering model.""" + from __future__ import annotations from pathlib import Path @@ -21,6 +22,7 @@ import pimmslearn import pimmslearn.models as models + # patch plotting function from pimmslearn.models import collab, plot_loss @@ -55,14 +57,15 @@ class CollaborativeFilteringTransformer(TransformerMixin, BaseEstimator): """ - def __init__(self, - target_column: str, - sample_column: str, - item_column: str, - n_factors: int = 15, - out_folder: str = '.', - batch_size: int = 4096, - ): + def __init__( + self, + target_column: str, + sample_column: str, + item_column: str, + n_factors: int = 15, + out_folder: str = ".", + batch_size: int = 4096, + ): self.target_column = target_column self.item_column = item_column self.sample_column = sample_column @@ -71,10 +74,14 @@ def __init__(self, self.out_folder.mkdir(exist_ok=True, parents=True) self.batch_size = batch_size - def fit(self, X: pd.Series, y: pd.Series = None, - epochs_max=20, - cuda: bool = True, - patience: int = 1): + def fit( + self, + X: pd.Series, + y: pd.Series = None, + epochs_max=20, + cuda: bool = True, + patience: int = 1, + ): """Fit the collaborative filtering model to the data provided in long-format. Parameters @@ -105,8 +112,7 @@ def fit(self, X: pd.Series, y: pd.Series = None, """ self.model_kwargs = dict( n_factors=self.n_factors, - y_range=(int(X.squeeze().min()), - int(X.squeeze().max()) + 1) + y_range=(int(X.squeeze().min()), int(X.squeeze().max()) + 1), ) if not cuda: default_device(use=False) # set to cpu @@ -124,36 +130,38 @@ def fit(self, X: pd.Series, y: pd.Series = None, splits = idx_splitter(X) self.cat_names = [self.sample_column, self.item_column] - self.to = TabularCollab(df=X, - procs=[Categorify], - cat_names=self.cat_names, - y_names=[self.target_column], - y_block=TransformBlock(), - splits=splits) - self.dls = self.to.dataloaders(path='.', bs=self.batch_size) + self.to = TabularCollab( + df=X, + procs=[Categorify], + cat_names=self.cat_names, + y_names=[self.target_column], + y_block=TransformBlock(), + splits=splits, + ) + self.dls = self.to.dataloaders(path=".", bs=self.batch_size) self.model = EmbeddingDotBias.from_classes( - classes=self.dls.classes, - **self.model_kwargs) + classes=self.dls.classes, **self.model_kwargs + ) self.n_params = models.calc_net_weight_count(self.model) - self.learn = Learner(dls=self.dls, - model=self.model, - loss_func=MSELossFlat(), - cbs=EarlyStoppingCallback(patience=patience) if y is not None else None, - model_dir=self.out_folder) + self.learn = Learner( + dls=self.dls, + model=self.model, + loss_func=MSELossFlat(), + cbs=EarlyStoppingCallback(patience=patience) if y is not None else None, + model_dir=self.out_folder, + ) if cuda: self.learn.model = self.learn.model.cuda() suggested_lr = self.learn.lr_find() print(f"{suggested_lr.valley = :.5f}") - self.learn.fit_one_cycle(epochs_max, - - lr_max=suggested_lr.valley) + self.learn.fit_one_cycle(epochs_max, lr_max=suggested_lr.valley) self.plot_loss(y) self.epochs_trained_ = self.learn.epoch + 1 - self.model_kwargs['suggested_inital_lr'] = suggested_lr.valley + self.model_kwargs["suggested_inital_lr"] = suggested_lr.valley # ? own method? # self.learn.save('collab_model') @@ -174,7 +182,7 @@ def transform(self, X): The complete data with imputed values in long format """ # Check is fit had been called - check_is_fitted(self, 'epochs_trained_') + check_is_fitted(self, "epochs_trained_") # ! Input validation # X = check_array(X, accept_sparse=True) @@ -190,15 +198,16 @@ def transform(self, X): def plot_loss(self, y, figsize=(8, 4), save: bool = False): # -> Axes: """Plot the training and validation loss of the model.""" fig, ax = plt.subplots(figsize=figsize) - ax.set_title('CF loss: Reconstruction loss') - self.learn.recorder.plot_loss(skip_start=5, ax=ax, - with_valid=True if y is not None else False) - self.model_kwargs['batch_size'] = self.batch_size + ax.set_title("CF loss: Reconstruction loss") + self.learn.recorder.plot_loss( + skip_start=5, ax=ax, with_valid=True if y is not None else False + ) + self.model_kwargs["batch_size"] = self.batch_size if save: - fig.savefig(self.out_folder / 'loss.png') - pimmslearn.savefig(fig, name='collab_training', - folder=self.out_folder) - - pimmslearn.io.dump_json(self.model_kwargs, - self.out_folder / 'model_params_{}.json'.format('CF')) + fig.savefig(self.out_folder / "loss.png") + pimmslearn.savefig(fig, name="collab_training", folder=self.out_folder) + + pimmslearn.io.dump_json( + self.model_kwargs, self.out_folder / "model_params_{}.json".format("CF") + ) return ax diff --git a/pimmslearn/transform.py b/pimmslearn/transform.py index 06bfa014a..bc381f1fc 100644 --- a/pimmslearn/transform.py +++ b/pimmslearn/transform.py @@ -12,6 +12,7 @@ # ! general transform and inverse_transform needs to move somewhere else + def transform(self, X, **kwargs): res = super(self.__class__, self).transform(X, **kwargs) if isinstance(X, pd.DataFrame): @@ -39,17 +40,23 @@ def inverse_transform(self, X, **kwargs): def make_pandas_compatible(cls): """Patch transform and inverse_transform.""" # ? could become factory function, build args dictionary - _fcts = ['transform', 'inverse_transform'] + _fcts = ["transform", "inverse_transform"] for _fct in _fcts: if not hasattr(cls, _fct): raise ValueError(f"no {_fct} method for {cls.__name__}") - new_class = type(cls.__name__, (cls,), dict( - transform=transform, inverse_transform=inverse_transform)) + new_class = type( + cls.__name__, + (cls,), + dict(transform=transform, inverse_transform=inverse_transform), + ) new_class.transform.__doc__ = cls.transform.__doc__ + msg_return_docstring - new_class.inverse_transform.__doc__ = cls.inverse_transform.__doc__ + msg_return_docstring + new_class.inverse_transform.__doc__ = ( + cls.inverse_transform.__doc__ + msg_return_docstring + ) return new_class + # ? Can this be a MixIn class? # # this could be a class method # @make_pandas_compatible @@ -64,11 +71,15 @@ def make_pandas_compatible(cls): MinMaxScaler = make_pandas_compatible(preprocessing.MinMaxScaler) -class VaepPipeline(): +class VaepPipeline: """Custom Pipeline combining a pandas.DataFrame and a sklearn.pipeline.Pipleine.""" - def __init__(self, df_train: pd.DataFrame, encode: sklearn.pipeline.Pipeline, - decode: List[str] = None): + def __init__( + self, + df_train: pd.DataFrame, + encode: sklearn.pipeline.Pipeline, + decode: List[str] = None, + ): """[summary] Parameters @@ -88,9 +99,7 @@ def __init__(self, df_train: pd.DataFrame, encode: sklearn.pipeline.Pipeline, if decode: self.decode = list() for d in decode: - self.decode.append( - (d, self.encode.named_steps[d]) - ) + self.decode.append((d, self.encode.named_steps[d])) self.decode = sklearn.pipeline.Pipeline(self.decode) else: diff --git a/pimmslearn/utils.py b/pimmslearn/utils.py index e9c466000..3e9c356ca 100644 --- a/pimmslearn/utils.py +++ b/pimmslearn/utils.py @@ -6,10 +6,12 @@ from pimmslearn.io.datasplits import long_format -def append_to_filepath(filepath: Union[pathlib.Path, str], - to_append: str, - sep: str = '_', - new_suffix: str = None) -> pathlib.Path: +def append_to_filepath( + filepath: Union[pathlib.Path, str], + to_append: str, + sep: str = "_", + new_suffix: str = None, +) -> pathlib.Path: """Append filepath with specified to_append using a seperator. Example: `data.csv` to data_processed.csv @@ -18,19 +20,19 @@ def append_to_filepath(filepath: Union[pathlib.Path, str], suffix = filepath.suffix if new_suffix: suffix = f".{new_suffix}" - new_fp = filepath.parent / f'{filepath.stem}{sep}{to_append}{suffix}' + new_fp = filepath.parent / f"{filepath.stem}{sep}{to_append}{suffix}" return new_fp -def create_random_missing_data(N, M, - mean: float = 25.0, std_dev: float = 2.0, - prop_missing: float = 0.15): +def create_random_missing_data( + N, M, mean: float = 25.0, std_dev: float = 2.0, prop_missing: float = 0.15 +): data = np.random.normal(loc=mean, scale=std_dev, size=(N, M)) prop_missing = float(prop_missing) if prop_missing > 0.0 and prop_missing < 1.0: - mask = np.random.choice([False, True], - size=data.shape, - p=[prop_missing, 1 - prop_missing]) + mask = np.random.choice( + [False, True], size=data.shape, p=[prop_missing, 1 - prop_missing] + ) data = np.where(mask, data, np.nan) return data @@ -39,17 +41,20 @@ def create_random_missing_data_long(N: int, M: int, prop_missing=0.1): """Build example long""" data = create_random_missing_data(N=N, M=M, prop_missing=prop_missing) df_long = long_format(pd.DataFrame(data)) - df_long.index.names = ('Sample ID', 'peptide') + df_long.index.names = ("Sample ID", "peptide") df_long.reset_index(inplace=True) return df_long -def create_random_df(N: int, M: int, - scaling_factor: float = 30.0, - prop_na: float = 0.0, - start_idx: int = 0, - name_index='Sample ID', - name_columns='peptide'): +def create_random_df( + N: int, + M: int, + scaling_factor: float = 30.0, + prop_na: float = 0.0, + start_idx: int = 0, + name_index="Sample ID", + name_columns="peptide", +): X = np.random.rand(N, M) if prop_na > 0.0 and prop_na < 1.0: @@ -58,10 +63,11 @@ def create_random_df(N: int, M: int, X *= scaling_factor - X = pd.DataFrame(X, - index=[f'sample_{i:0{len(str(N))}}' - for i in range(start_idx, start_idx + N)], - columns=(f'feat_{i:0{len(str(M))}}' for i in range(M))) + X = pd.DataFrame( + X, + index=[f"sample_{i:0{len(str(N))}}" for i in range(start_idx, start_idx + N)], + columns=(f"feat_{i:0{len(str(M))}}" for i in range(M)), + ) X.index.name = name_index X.columns.name = name_columns return X From 3c0b2c4e5058629acd2bed32f73b23d9d0887768 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 08:36:16 +0100 Subject: [PATCH 02/12] :art: isort imports, configure to work with black --- pimmslearn/analyzers/diff_analysis.py | 3 ++- pimmslearn/databases/diseases.py | 1 + pimmslearn/io/load.py | 2 +- pimmslearn/logging.py | 4 ++-- pimmslearn/models/analysis.py | 4 ++-- pimmslearn/models/collect_dumps.py | 8 +++++--- pimmslearn/nb.py | 4 ++-- pimmslearn/plotting/errors.py | 1 - pimmslearn/utils.py | 1 + pyproject.toml | 3 +++ 10 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pimmslearn/analyzers/diff_analysis.py b/pimmslearn/analyzers/diff_analysis.py index 270c75565..57356a42c 100644 --- a/pimmslearn/analyzers/diff_analysis.py +++ b/pimmslearn/analyzers/diff_analysis.py @@ -1,6 +1,7 @@ from __future__ import annotations -from collections import namedtuple + import logging +from collections import namedtuple import pandas as pd diff --git a/pimmslearn/databases/diseases.py b/pimmslearn/databases/diseases.py index 361bbbd28..304db8770 100644 --- a/pimmslearn/databases/diseases.py +++ b/pimmslearn/databases/diseases.py @@ -1,4 +1,5 @@ import logging + import requests logger = logging.getLogger(__name__) diff --git a/pimmslearn/io/load.py b/pimmslearn/io/load.py index b29a80070..13e64ada5 100644 --- a/pimmslearn/io/load.py +++ b/pimmslearn/io/load.py @@ -1,5 +1,5 @@ import logging -from typing import Union, List +from typing import List, Union import pandas as pd diff --git a/pimmslearn/logging.py b/pimmslearn/logging.py index dacd3a39e..9d57059ed 100644 --- a/pimmslearn/logging.py +++ b/pimmslearn/logging.py @@ -1,9 +1,9 @@ """Custom logging setup for notebooks.""" -from pathlib import Path -from datetime import datetime import logging import sys +from datetime import datetime +from pathlib import Path LOG_FOLDER = Path("logs") LOG_FOLDER.mkdir(exist_ok=True) diff --git a/pimmslearn/models/analysis.py b/pimmslearn/models/analysis.py index 6b1318458..4003607c8 100644 --- a/pimmslearn/models/analysis.py +++ b/pimmslearn/models/analysis.py @@ -1,8 +1,8 @@ -import pimmslearn.transform -import torch.nn import fastai.data.core import fastai.learner +import torch.nn +import pimmslearn.transform from pimmslearn.analyzers import Analysis diff --git a/pimmslearn/models/collect_dumps.py b/pimmslearn/models/collect_dumps.py index d48d11007..5ac308cbc 100644 --- a/pimmslearn/models/collect_dumps.py +++ b/pimmslearn/models/collect_dumps.py @@ -1,11 +1,13 @@ """Collects metrics and config files from the experiment directory structure.""" -from functools import partial, update_wrapper +import json import logging +from functools import partial, update_wrapper from pathlib import Path -import json +from typing import Callable, Iterable + import yaml -from typing import Iterable, Callable + import pimmslearn.pandas logger = logging.getLogger(__name__) diff --git a/pimmslearn/nb.py b/pimmslearn/nb.py index 9502d23b1..86f533a9c 100644 --- a/pimmslearn/nb.py +++ b/pimmslearn/nb.py @@ -1,11 +1,11 @@ +import logging from pathlib import Path from pprint import pformat + import yaml import pimmslearn.io -import logging - logger = logging.getLogger() diff --git a/pimmslearn/plotting/errors.py b/pimmslearn/plotting/errors.py index 84905f747..caa346a1b 100644 --- a/pimmslearn/plotting/errors.py +++ b/pimmslearn/plotting/errors.py @@ -9,7 +9,6 @@ from matplotlib.axes import Axes from seaborn.categorical import EstimateAggregator - import pimmslearn.pandas.calc_errors diff --git a/pimmslearn/utils.py b/pimmslearn/utils.py index 3e9c356ca..27f819596 100644 --- a/pimmslearn/utils.py +++ b/pimmslearn/utils.py @@ -1,5 +1,6 @@ import pathlib from typing import Union + import numpy as np import pandas as pd diff --git a/pyproject.toml b/pyproject.toml index 7dc9a7641..0d5ad0d14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,5 +63,8 @@ requires = ["setuptools>=64", "setuptools_scm>=8", "wheel"] # https://setuptools-scm.readthedocs.io/ # used to pick up the version from the git tags or the latest commit. +[tool.isort] +profile = "black" + [tool.setuptools.packages.find] include = ["pimmslearn"] From 51dc32839cf4a38112bfaa7451ffc9fa4c88fc00 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 08:37:29 +0100 Subject: [PATCH 03/12] =?UTF-8?q?=F0=9F=94=A7=20set=20up=20basic=20ruff=20?= =?UTF-8?q?checks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pimmslearn/models/__init__.py | 2 +- pyproject.toml | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pimmslearn/models/__init__.py b/pimmslearn/models/__init__.py index c11ef2228..7b651741c 100644 --- a/pimmslearn/models/__init__.py +++ b/pimmslearn/models/__init__.py @@ -116,7 +116,7 @@ def plot_training_losses( norm_train=norm_train, norm_val=norm_val, ) - if not folder is None: + if folder is not None: name = name.lower() _ = RecorderDump(learner.recorder, name).save(folder) pimmslearn.savefig(fig, name=f"{name}_training", folder=folder) diff --git a/pyproject.toml b/pyproject.toml index 0d5ad0d14..b34c6b0e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,17 @@ docs = [ [tool.flake8] max-line-length = 120 aggressive = 2 -ignore = ["E501"] + +[tool.ruff] + +[tool.ruff.lint] +# https://docs.astral.sh/ruff/tutorial/#rule-selection +# 1. Enable flake8-bugbear (`B`) rules +# 2. Enable pycodestyle (`E`) errors and (`W`) warnings +# 3. Pyflakes (`F`) errors +# extend-select = ["E", "W", "F", "B"] +# Ignore star imports used by fastai +ignore = ["F403"] [build-system] build-backend = "setuptools.build_meta" From e64c47164030b68196605400ecd74fd667d1a593 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 08:47:40 +0100 Subject: [PATCH 04/12] :art: remove variables, initialize with tuple --- pimmslearn/analyzers/compare_predictions.py | 5 ++++- pimmslearn/cmd_interface/setup_diff_analysis_website.py | 2 -- pimmslearn/cmd_interface/setup_imp_cp_website.py | 2 -- pimmslearn/plotting/__init__.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pimmslearn/analyzers/compare_predictions.py b/pimmslearn/analyzers/compare_predictions.py index 4b40c1036..ab696fd83 100644 --- a/pimmslearn/analyzers/compare_predictions.py +++ b/pimmslearn/analyzers/compare_predictions.py @@ -6,12 +6,15 @@ import pandas as pd -def load_predictions(pred_files: List, shared_columns=["observed"]): +def load_predictions(pred_files: List, shared_columns=("observed",)): pred_files = iter(pred_files) fname = next(pred_files) pred = pd.read_csv(fname, index_col=[0, 1]) + # cast to list + shared_columns = list(shared_columns) + for fname in pred_files: _pred_file = pd.read_csv(fname, index_col=[0, 1]) idx_shared = pred.index.intersection(_pred_file.index) diff --git a/pimmslearn/cmd_interface/setup_diff_analysis_website.py b/pimmslearn/cmd_interface/setup_diff_analysis_website.py index 4965cf5cf..b68f88fd7 100644 --- a/pimmslearn/cmd_interface/setup_diff_analysis_website.py +++ b/pimmslearn/cmd_interface/setup_diff_analysis_website.py @@ -68,14 +68,12 @@ def main(): for _f in subfolder_comparison.glob("**/*.ipynb") if _f.is_file() ] - nbs groups = defaultdict(list) for nb in nbs: _group = nb.name.split("_")[1] groups[_group].append(nb) groups = dict(groups) - groups # Parse notebooks present in imputation workflow diff --git a/pimmslearn/cmd_interface/setup_imp_cp_website.py b/pimmslearn/cmd_interface/setup_imp_cp_website.py index 42c7d2b81..b7c5bba3a 100644 --- a/pimmslearn/cmd_interface/setup_imp_cp_website.py +++ b/pimmslearn/cmd_interface/setup_imp_cp_website.py @@ -129,14 +129,12 @@ def main(): folder_experiment = Path(folder_experiment) nbs = [_f.name for _f in folder_experiment.iterdir() if _f.suffix == ".ipynb"] - nbs groups = defaultdict(list) for nb in nbs: _group = nb.split("_")[1] groups[_group].append(nb) groups = dict(groups) - groups # Parse notebooks present in imputation workflow diff --git a/pimmslearn/plotting/__init__.py b/pimmslearn/plotting/__init__.py index 9cf52007d..b0f1d3fd5 100644 --- a/pimmslearn/plotting/__init__.py +++ b/pimmslearn/plotting/__init__.py @@ -110,7 +110,7 @@ def select_dates(date_series: pd.Series, max_ticks=30) -> np.array: if offset > 1: return xticks[::offset] else: - xticks + return xticks def make_large_descriptors(size="xx-large"): From 2ec4085aa6fb75a87550bdda12e3b7f4f49f84af Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 08:48:02 +0100 Subject: [PATCH 05/12] :art: raise errors from other errors correctly --- pimmslearn/analyzers/analyzers.py | 4 ++-- pimmslearn/io/datasplits.py | 4 ++-- pimmslearn/nb.py | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pimmslearn/analyzers/analyzers.py b/pimmslearn/analyzers/analyzers.py index 68931a08b..e510b1e80 100644 --- a/pimmslearn/analyzers/analyzers.py +++ b/pimmslearn/analyzers/analyzers.py @@ -418,11 +418,11 @@ def plot_by_category(self, meta_key: str, save: bool = True): def _plot(self, fct, meta_key: str, save: bool = True): try: meta_data = self.meta_data[meta_key] - except KeyError: + except KeyError as e: raise ValueError( f"Requested key: '{meta_key}' is not in available," f" use: {', '.join(x for x in self.meta_data.columns)}" - ) + ) from e fig, ax = plt.subplots(figsize=self.fig_size) _ = fct( df=self.latent_reduced, diff --git a/pimmslearn/io/datasplits.py b/pimmslearn/io/datasplits.py index 70498d414..a3f236bea 100644 --- a/pimmslearn/io/datasplits.py +++ b/pimmslearn/io/datasplits.py @@ -178,11 +178,11 @@ def interpolate(self, dataset: Union[str, pd.DataFrame]): elif issubclass(type(dataset), str): try: ds = getattr(self, dataset) - except AttributeError: + except AttributeError as e: raise AttributeError( f"Please provide a valid attribute, not '{dataset}'. " "Valid attributes are {}".format(", ".join(x for x in self._items)) - ) + ) from e if dataset[-1] in ["y", "Y"]: logger.warning( f"Attempting to interpolate target: {dataset} " diff --git a/pimmslearn/nb.py b/pimmslearn/nb.py index 86f533a9c..cb5179105 100644 --- a/pimmslearn/nb.py +++ b/pimmslearn/nb.py @@ -37,8 +37,10 @@ def dump(self, fname=None): try: fname = self.out_folder fname = Path(fname) / "model_config.yml" - except AttributeError: - raise AttributeError('Specify fname or set "out_folder" attribute.') + except AttributeError as e: + raise AttributeError( + 'Specify fname or set "out_folder" attribute.' + ) from e d = pimmslearn.io.parse_dict(input_dict=self.__dict__) with open(fname, "w") as f: yaml.dump(d, f) From 66785055e1b79a8e51ac1bcb27d26221c07fb042 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 08:48:33 +0100 Subject: [PATCH 06/12] =?UTF-8?q?=F0=9F=94=A7=20enable=20more=20checks=20a?= =?UTF-8?q?nd=20adjust=20line=20length=20to=20120?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b34c6b0e8..0a462e9f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,10 +61,13 @@ aggressive = 2 # 1. Enable flake8-bugbear (`B`) rules # 2. Enable pycodestyle (`E`) errors and (`W`) warnings # 3. Pyflakes (`F`) errors -# extend-select = ["E", "W", "F", "B"] +extend-select = ["E", "W", "F", "B"] # Ignore star imports used by fastai ignore = ["F403"] +[tool.ruff.lint.pycodestyle] +max-line-length = 120 + [build-system] build-backend = "setuptools.build_meta" requires = ["setuptools>=64", "setuptools_scm>=8", "wheel"] From 27db0345ae50836a637d8c8ba8b7db6efbd0d095 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 12:13:47 +0100 Subject: [PATCH 07/12] :art: format conf.py --- docs/conf.py | 74 +++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index ce50613ae..a76deacd1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,9 +15,9 @@ # -- Project information ----------------------------------------------------- -project = 'pimms' -copyright = '2023, Henry Webel' -author = 'Henry Webel' +project = "pimms" +copyright = "2023, Henry Webel" +author = "Henry Webel" PACKAGE_VERSION = metadata.version("pimms-learn") version = PACKAGE_VERSION @@ -30,25 +30,23 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.napoleon', - 'sphinx.ext.autodoc', - 'sphinx.ext.autodoc.typehints', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', - 'myst_nb', - 'sphinx_new_tab_link', + "sphinx.ext.napoleon", + "sphinx.ext.autodoc", + "sphinx.ext.autodoc.typehints", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "myst_nb", + "sphinx_new_tab_link", ] -myst_enable_extensions = [ - "strikethrough", - "dollarmath", - "amsmath" -] +myst_enable_extensions = ["strikethrough", "dollarmath", "amsmath"] # Plolty support through require javascript library # https://myst-nb.readthedocs.io/en/latest/render/interactive.html#plotly -html_js_files = ["https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"] +html_js_files = [ + "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" +] # https://myst-nb.readthedocs.io/en/latest/computation/execute.html nb_execution_mode = "auto" @@ -62,16 +60,17 @@ nb_merge_streams = True # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', - 'Thumbs.db', - '.DS_Store', - 'jupyter_execute', # avoid local re-execution of written nbs during development - ] +exclude_patterns = [ + "_build", + "Thumbs.db", + ".DS_Store", + "jupyter_execute", # avoid local re-execution of written nbs during development +] # Intersphinx options intersphinx_mapping = { @@ -88,8 +87,8 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'sphinx_book_theme' # pip install sphinx-book-theme -html_logo = 'logos/logo.png' +html_theme = "sphinx_book_theme" # pip install sphinx-book-theme +html_logo = "logos/logo.png" html_favicon = "logos/logo_thumbnail.png" # See: @@ -135,17 +134,20 @@ def run_apidoc(_): from sphinx.ext import apidoc - apidoc.main([ - "--force", - "--implicit-namespaces", - "--module-first", - # "--separate", - "-o", - str(PROJECT_ROOT / "docs" / "reference"), - str(PACKAGE_ROOT), - str(PACKAGE_ROOT / "*.c"), - str(PACKAGE_ROOT / "*.so"), - ]) + + apidoc.main( + [ + "--force", + "--implicit-namespaces", + "--module-first", + # "--separate", + "-o", + str(PROJECT_ROOT / "docs" / "reference"), + str(PACKAGE_ROOT), + str(PACKAGE_ROOT / "*.c"), + str(PACKAGE_ROOT / "*.so"), + ] + ) def setup(app): - app.connect('builder-inited', run_apidoc) + app.connect("builder-inited", run_apidoc) From 1d222b6f70c25b9b2abc3ca624261f91b6ea679f Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 12:15:20 +0100 Subject: [PATCH 08/12] :art: format tests --- .github/workflows/ci.yaml | 2 + project/01_0_split_data.py | 555 +++++++++++++----------- tests/io/test_data_objects.py | 8 +- tests/io/test_dataloaders.py | 16 +- tests/io/test_dataset.py | 29 +- tests/io/test_datasplits.py | 51 ++- tests/pandas/test_calc_errors.py | 410 +++++++++-------- tests/plotting/test_defaults.py | 10 +- tests/plotting/test_errors.py | 70 +-- tests/test_ae.py | 1 - tests/test_collab.py | 14 +- tests/test_imports.py | 2 +- tests/test_imputation.py | 12 +- tests/test_io.py | 6 +- tests/test_models/test_collect_dumps.py | 14 +- tests/test_models/test_transformers.py | 35 +- tests/test_nb.py | 4 +- tests/test_pandas.py | 87 ++-- tests/test_sampling.py | 39 +- tests/test_transfrom.py | 339 ++++++++------- tests/test_utils.py | 14 +- 21 files changed, 951 insertions(+), 767 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bd1f7fdc5..b0af57f9b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -17,6 +17,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: psf/black@stable + with: + src: pimmslearn - uses: isort/isort-action@v1 lint: name: Lint with ruff diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 4142e0995..3a8798011 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -40,7 +40,7 @@ logger = pimmslearn.logging.setup_nb_logger() logger.info("Split data and make diagnostic plots") -logging.getLogger('fontTools').setLevel(logging.WARNING) +logging.getLogger("fontTools").setLevel(logging.WARNING) def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): @@ -49,13 +49,12 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): except KeyError as e: logger.warning(e) logger.warning("Ignore missing samples in quantified samples") - df = df.loc[df.index.intersection( - df_meta.index)] + df = df.loc[df.index.intersection(df_meta.index)] return df_meta pd.options.display.max_columns = 32 -plt.rcParams['figure.figsize'] = [4, 2] +plt.rcParams["figure.figsize"] = [4, 2] pimmslearn.plotting.make_large_descriptors(7) @@ -71,26 +70,44 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): args = dict(globals()).keys() # %% tags=["parameters"] -FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' # Sample (rows), features (columns) -index_col: Union[str, int] = 0 # Can be either a string or position (default 0 for first column), or a list of these. -column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) -fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # metadata for samples (rows) -feat_prevalence: Union[int, float] = 0.25 # Minimum number or fraction of feature prevalence across samples to be kept -sample_completeness: Union[int, float] = 0.5 # Minimum number or fraction of total requested features per Sample +FN_INTENSITIES: str = ( + "data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv" # Sample (rows), features (columns) +) +index_col: Union[str, int] = ( + 0 # Can be either a string or position (default 0 for first column), or a list of these. +) +column_names: List[str] = [ + "Gene Names" +] # Manuelly set column names (of Index object in columns) +fn_rawfile_metadata: str = ( + "data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv" # metadata for samples (rows) +) +feat_prevalence: Union[int, float] = ( + 0.25 # Minimum number or fraction of feature prevalence across samples to be kept +) +sample_completeness: Union[int, float] = ( + 0.5 # Minimum number or fraction of total requested features per Sample +) select_N: int = None # only use latest N samples sample_N: bool = False # if select_N, sample N randomly instead of using latest N random_state: int = 42 # random state for reproducibility of splits -logarithm: str = 'log2' # Log transformation of initial data (select one of the existing in numpy) -folder_experiment: str = 'runs/example' # folder to save figures and data dumps -folder_data: str = '' # specify special data directory if needed -file_format: str = 'csv' # file format of create splits, default pickle (pkl) -use_every_nth_xtick: int = 1 # use every nth xtick in plots (default 1, i.e. every xtick is kept) +logarithm: str = ( + "log2" # Log transformation of initial data (select one of the existing in numpy) +) +folder_experiment: str = "runs/example" # folder to save figures and data dumps +folder_data: str = "" # specify special data directory if needed +file_format: str = "csv" # file format of create splits, default pickle (pkl) +use_every_nth_xtick: int = ( + 1 # use every nth xtick in plots (default 1, i.e. every xtick is kept) +) # metadata -> defaults for metadata extracted from machine data, used for plotting meta_date_col: str = None # date column in meta data meta_cat_col: str = None # category column in meta data # train, validation and test data splits frac_non_train: float = 0.1 # fraction of non training data (validation and test split) -frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random +frac_mnar: float = ( + 0.0 # fraction of missing not at random data, rest: missing completely at random +) prop_sample_w_sim: float = 1.0 # proportion of samples with simulated missing values feat_name_display: str = None # display name for feature name (e.g. 'protein group') @@ -105,11 +122,12 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # %% tags=["hide-input"] if not 0.0 <= args.frac_mnar <= 1.0: - raise ValueError("Invalid MNAR float value (should be betw. 0 and 1):" - f" {args.frac_mnar}") + raise ValueError( + "Invalid MNAR float value (should be betw. 0 and 1):" f" {args.frac_mnar}" + ) if isinstance(args.index_col, str) or isinstance(args.index_col, int): - args.overwrite_entry('index_col', [args.index_col]) + args.overwrite_entry("index_col", [args.index_col]) args.index_col # make sure it is an iterable # %% [markdown] @@ -122,63 +140,61 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): logger.info(f"{args.FN_INTENSITIES = }") -FILE_FORMAT_TO_CONSTRUCTOR = {'csv': 'from_csv', - 'pkl': 'from_pickle', - 'pickle': 'from_pickle', - } +FILE_FORMAT_TO_CONSTRUCTOR = { + "csv": "from_csv", + "pkl": "from_pickle", + "pickle": "from_pickle", +} FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:] -logger.info( - f"File format (extension): {FILE_EXT} (!specifies data loading function!)") +logger.info(f"File format (extension): {FILE_EXT} (!specifies data loading function!)") # %% tags=["hide-input"] # # ! factor out file reading to a separate module, not class # AnalyzePeptides.from_csv constructor = getattr(pimmslearn.io.load, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT]) -df = constructor(fname=args.FN_INTENSITIES, - index_col=args.index_col, - ) +df = constructor( + fname=args.FN_INTENSITIES, + index_col=args.index_col, +) if args.column_names: df.columns.names = args.column_names if args.feat_name_display is None: - args.overwrite_entry('feat_name_display', 'features') + args.overwrite_entry("feat_name_display", "features") if args.column_names: - args.overwrite_entry('feat_name_display', args.column_names[0]) + args.overwrite_entry("feat_name_display", args.column_names[0]) if not df.index.name: logger.warning("No sample index name found, setting to 'Sample ID'") - df.index.name = 'Sample ID' + df.index.name = "Sample ID" if args.logarithm: log_fct = getattr(np, args.logarithm) - df = log_fct(df) # ! potentially add check to increase value by 1 if 0 is present (should be part of preprocessing) + df = log_fct( + df + ) # ! potentially add check to increase value by 1 if 0 is present (should be part of preprocessing) df # %% tags=["hide-input"] -ax = (df - .notna() - .sum(axis=0) - .to_frame(df.columns.name) - .plot - .box() - ) -ax.set_ylabel('Frequency') +ax = df.notna().sum(axis=0).to_frame(df.columns.name).plot.box() +ax.set_ylabel("Frequency") # %% tags=["hide-input"] -fname = args.out_folder / '01_0_data_stats.xlsx' +fname = args.out_folder / "01_0_data_stats.xlsx" dumps[fname.name] = fname.as_posix() writer = pd.ExcelWriter(fname) notna = df.notna() data_stats_original = pd.concat( [ - notna.sum().describe().rename('feat_stats'), - notna.sum(axis=1).describe().rename('sample_stats') + notna.sum().describe().rename("feat_stats"), + notna.sum(axis=1).describe().rename("sample_stats"), ], - axis=1) -data_stats_original.to_excel(writer, sheet_name='data_stats_original') + axis=1, +) +data_stats_original.to_excel(writer, sheet_name="data_stats_original") data_stats_original @@ -188,6 +204,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # # > The Collaborative Modeling approach will need a single feature column. + # %% tags=["hide-input"] def join_as_str(seq): ret = "_".join(str(x) for x in seq) @@ -214,10 +231,12 @@ def join_as_str(seq): logger.warning("No metadata for samples provided, create placeholder.") if args.meta_date_col: raise ValueError( - f"No metadata provided, but data column set: {args.meta_date_col}") + f"No metadata provided, but data column set: {args.meta_date_col}" + ) if args.meta_cat_col: raise ValueError( - f"No metadata provided, but data column set: {args.meta_cat_col}") + f"No metadata provided, but data column set: {args.meta_cat_col}" + ) df_meta = pd.DataFrame(index=df.index) df_meta = df_meta.loc[df.index.to_list()] # index is sample index if df_meta.index.name is None: @@ -226,10 +245,9 @@ def join_as_str(seq): # %% tags=["hide-input"] if args.meta_date_col: - df_meta[args.meta_date_col] = pd.to_datetime( - df_meta[args.meta_date_col]) + df_meta[args.meta_date_col] = pd.to_datetime(df_meta[args.meta_date_col]) else: - args.overwrite_entry('meta_date_col', 'PlaceholderTime') + args.overwrite_entry("meta_date_col", "PlaceholderTime") df_meta[args.meta_date_col] = range(len(df_meta)) df_meta @@ -241,7 +259,7 @@ def join_as_str(seq): df_meta = df_meta.sort_values(args.meta_date_col) # %% tags=["hide-input"] -meta_stats = df_meta.describe(include='all') +meta_stats = df_meta.describe(include="all") meta_stats # %% [markdown] @@ -249,13 +267,16 @@ def join_as_str(seq): # %% tags=["hide-input"] try: - display(meta_stats.loc[:, (meta_stats.loc['unique'] - > 1) | (meta_stats.loc['std'] > 0.1)]) + display( + meta_stats.loc[ + :, (meta_stats.loc["unique"] > 1) | (meta_stats.loc["std"] > 0.1) + ] + ) except KeyError: - if 'std' in meta_stats.index: - display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)]) - if 'unique' in meta_stats.index: - display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)]) + if "std" in meta_stats.index: + display(meta_stats.loc[:, (meta_stats.loc["std"] > 0.1)]) + if "unique" in meta_stats.index: + display(meta_stats.loc[:, (meta_stats.loc["std"] > 0.1)]) # %% tags=["hide-input"] @@ -280,12 +301,11 @@ def join_as_str(seq): if args.sample_N: df_meta = df_meta.sample(args.select_N) else: - df_meta = df_meta.iloc[-args.select_N:] + df_meta = df_meta.iloc[-args.select_N :] - df = df.loc[df_meta.index].dropna( - how='all', axis=1) - ax = df.T.describe().loc['count'].hist() - _ = ax.set_title('histogram of features for all eligable samples') + df = df.loc[df_meta.index].dropna(how="all", axis=1) + ax = df.T.describe().loc["count"].hist() + _ = ax.set_title("histogram of features for all eligable samples") # %% [markdown] # ## First Step: Select features by prevalence @@ -299,13 +319,12 @@ def join_as_str(seq): N_samples = len(df) logger.info(f"Current number of samples: {N_samples}") logger.info( - f"Feature has to be present in at least {args.feat_prevalence:.2%} of samples") - args.overwrite_entry('feat_prevalence', int( - N_samples * args.feat_prevalence)) + f"Feature has to be present in at least {args.feat_prevalence:.2%} of samples" + ) + args.overwrite_entry("feat_prevalence", int(N_samples * args.feat_prevalence)) assert isinstance(args.feat_prevalence, int) # ! check that feature prevalence is greater equal to 3 (otherwise train, val, test split is not possible) -logger.info( - f"Feature has to be present in at least {args.feat_prevalence} of samples") +logger.info(f"Feature has to be present in at least {args.feat_prevalence} of samples") # select features mask = freq_per_feature >= args.feat_prevalence logger.info(f"Drop {(~mask).sum()} features") @@ -317,11 +336,12 @@ def join_as_str(seq): notna = df.notna() data_stats_filtered = pd.concat( [ - notna.sum().describe().rename('feat_stats'), - notna.sum(axis=1).describe().rename('sample_stats') + notna.sum().describe().rename("feat_stats"), + notna.sum(axis=1).describe().rename("sample_stats"), ], - axis=1) -data_stats_filtered.to_excel(writer, sheet_name='data_stats_filtered') + axis=1, +) +data_stats_filtered.to_excel(writer, sheet_name="data_stats_filtered") data_stats_filtered # %% [markdown] @@ -332,11 +352,12 @@ def join_as_str(seq): # %% tags=["hide-input"] if isinstance(args.sample_completeness, float): - msg = f'Fraction of minimum sample completeness over all features specified with: {args.sample_completeness}\n' + msg = f"Fraction of minimum sample completeness over all features specified with: {args.sample_completeness}\n" # assumes df in wide format - args.overwrite_entry('sample_completeness', int( - df.shape[1] * args.sample_completeness)) - msg += f'This translates to a minimum number of features per sample (to be included): {args.sample_completeness}' + args.overwrite_entry( + "sample_completeness", int(df.shape[1] * args.sample_completeness) + ) + msg += f"This translates to a minimum number of features per sample (to be included): {args.sample_completeness}" logger.info(msg) sample_counts = df.notna().sum(axis=1) # if DataFrame @@ -344,11 +365,10 @@ def join_as_str(seq): # %% tags=["hide-input"] mask = sample_counts > args.sample_completeness -msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.' +msg = f"Drop {len(mask) - mask.sum()} of {len(mask)} initial samples." logger.info(msg) df = df.loc[mask] -df = df.dropna( - axis=1, how='all') # drop now missing features +df = df.dropna(axis=1, how="all") # drop now missing features # %% tags=["hide-input"] args.N, args.M = df.shape # save data dimensions @@ -360,20 +380,19 @@ def join_as_str(seq): # %% tags=["hide-input"] group = 1 ax = df.notna().sum(axis=1).hist() -ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample') -ax.set_ylabel('observations') -fname = args.out_figures / f'0_{group}_hist_features_per_sample' +ax.set_xlabel(f"{args.feat_name_display.capitalize()} per eligable sample") +ax.set_ylabel("observations") +fname = args.out_figures / f"0_{group}_hist_features_per_sample" figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] ax = df.notna().sum(axis=0).sort_values().plot() -_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()] -_ = ax.set_xticklabels(_new_labels, rotation=45, - horizontalalignment='right') -ax.set_xlabel(f'{args.feat_name_display.capitalize()} prevalence') -ax.set_ylabel('observations') -fname = args.out_figures / f'0_{group}_feature_prevalence' +_new_labels = [l_.get_text().split(";")[0] for l_ in ax.get_xticklabels()] +_ = ax.set_xticklabels(_new_labels, rotation=45, horizontalalignment="right") +ax.set_xlabel(f"{args.feat_name_display.capitalize()} prevalence") +ax.set_ylabel("observations") +fname = args.out_figures / f"0_{group}_feature_prevalence" figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) @@ -384,33 +403,40 @@ def join_as_str(seq): # %% tags=["hide-input"] min_max = pimmslearn.plotting.data.min_max(df.stack()) ax, bins = pimmslearn.plotting.data.plot_histogram_intensities( - df.stack(), min_max=min_max) -ax.set_xlabel('Intensity binned') -fname = args.out_figures / f'0_{group}_intensity_distribution_overall' + df.stack(), min_max=min_max +) +ax.set_xlabel("Intensity binned") +fname = args.out_figures / f"0_{group}_intensity_distribution_overall" figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] ax = pimmslearn.plotting.data.plot_feat_median_over_prop_missing( - data=df, type='scatter') -fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter' + data=df, type="scatter" +) +fname = args.out_figures / f"0_{group}_intensity_median_vs_prop_missing_scatter" ax.set_xlabel( - f'{args.feat_name_display.capitalize()} binned by their median intensity' - f' (N {args.feat_name_display})') + f"{args.feat_name_display.capitalize()} binned by their median intensity" + f" (N {args.feat_name_display})" +) figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] -ax, _data_feat_median_over_prop_missing = pimmslearn.plotting.data.plot_feat_median_over_prop_missing( - data=df, type='boxplot', return_plot_data=True) -fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot' +ax, _data_feat_median_over_prop_missing = ( + pimmslearn.plotting.data.plot_feat_median_over_prop_missing( + data=df, type="boxplot", return_plot_data=True + ) +) +fname = args.out_figures / f"0_{group}_intensity_median_vs_prop_missing_boxplot" ax.set_xlabel( - f'{args.feat_name_display.capitalize()} binned by their median intensity' - f' (N {args.feat_name_display})') + f"{args.feat_name_display.capitalize()} binned by their median intensity" + f" (N {args.feat_name_display})" +) figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) -_data_feat_median_over_prop_missing.to_csv(fname.with_suffix('.csv')) +_data_feat_median_over_prop_missing.to_csv(fname.with_suffix(".csv")) # _data_feat_median_over_prop_missing.to_excel(fname.with_suffix('.xlsx')) del _data_feat_median_over_prop_missing @@ -418,7 +444,7 @@ def join_as_str(seq): # ### Interactive and Single plots # %% tags=["hide-input"] -_feature_display_name = f'identified {args.feat_name_display}' +_feature_display_name = f"identified {args.feat_name_display}" sample_counts.name = _feature_display_name # %% tags=["hide-input"] @@ -433,24 +459,31 @@ def join_as_str(seq): pcs # %% tags=["hide-input"] -pcs.describe(include='all').T +pcs.describe(include="all").T # %% tags=["hide-input"] if args.meta_cat_col: fig, ax = plt.subplots(figsize=(3, 3)) analyzers.seaborn_scatter( - pcs[pcs_name], ax, meta=pcs[args.meta_cat_col], title=f"by {args.meta_cat_col}") - fname = (args.out_figures - / f'0_{group}_pca_sample_by_{"_".join(args.meta_cat_col.split())}') + pcs[pcs_name], ax, meta=pcs[args.meta_cat_col], title=f"by {args.meta_cat_col}" + ) + fname = ( + args.out_figures + / f'0_{group}_pca_sample_by_{"_".join(args.meta_cat_col.split())}' + ) figures[fname.stem] = fname pimmslearn.savefig(fig, fname) # %% tags=["hide-input"] -if args.meta_date_col != 'PlaceholderTime': +if args.meta_date_col != "PlaceholderTime": fig, ax = plt.subplots() analyzers.plot_date_map( - df=pcs[pcs_name], ax=ax, dates=pcs[args.meta_date_col], title=f'by {args.meta_date_col}') - fname = args.out_figures / f'0_{group}_pca_sample_by_date' + df=pcs[pcs_name], + ax=ax, + dates=pcs[args.meta_date_col], + title=f"by {args.meta_date_col}", + ) + fname = args.out_figures / f"0_{group}_pca_sample_by_date" figures[fname.stem] = fname pimmslearn.savefig(fig, fname) @@ -467,8 +500,10 @@ def join_as_str(seq): feat_name_display=args.feat_name_display, size=5, ) -fname = (args.out_figures - / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}.pdf') +fname = ( + args.out_figures + / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}.pdf' +) figures[fname.stem] = fname pimmslearn.savefig(fig, fname) @@ -478,18 +513,22 @@ def join_as_str(seq): # %% tags=["hide-input"] fig = px.scatter( - pcs, x=pcs_name[0], y=pcs_name[1], + pcs, + x=pcs_name[0], + y=pcs_name[1], hover_name=pcs_index_name, # hover_data=analysis.df_meta, - title=f'First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples', + title=f"First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples", # color=pcs['Software Version'], color=col_identified_feat, - template='none', + template="none", width=1200, # 4 inches x 300 dpi - height=600 # 2 inches x 300 dpi + height=600, # 2 inches x 300 dpi +) +fname = ( + args.out_figures + / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf' ) -fname = (args.out_figures - / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf') figures[fname.stem] = fname fig.write_image(fname) fig # stays interactive in html @@ -503,24 +542,25 @@ def join_as_str(seq): # %% tags=["hide-input"] df_w_date = df.join(df_meta[args.meta_date_col]) df_w_date = df_w_date.set_index(args.meta_date_col).sort_index() -if not args.meta_date_col == 'PlaceholderTime': - df_w_date.to_period('min') +if not args.meta_date_col == "PlaceholderTime": + df_w_date.to_period("min") df_w_date = df_w_date.T df_w_date # %% tags=["hide-input"] -ax = df_w_date.plot.box(rot=80, - figsize=(7, 3), - fontsize=7, - showfliers=False, - showcaps=False, - boxprops=dict(linewidth=.4, color='darkblue'), - flierprops=dict(markersize=.4, color='lightblue'), - ) +ax = df_w_date.plot.box( + rot=80, + figsize=(7, 3), + fontsize=7, + showfliers=False, + showcaps=False, + boxprops=dict(linewidth=0.4, color="darkblue"), + flierprops=dict(markersize=0.4, color="lightblue"), +) _ = pimmslearn.plotting.select_xticks(ax) fig = ax.get_figure() -fname = args.out_figures / f'0_{group}_median_boxplot' -df_w_date.to_pickle(fname.with_suffix('.pkl')) +fname = args.out_figures / f"0_{group}_median_boxplot" +df_w_date.to_pickle(fname.with_suffix(".pkl")) figures[fname.stem] = fname pimmslearn.savefig(fig, fname) del df_w_date @@ -537,23 +577,22 @@ def join_as_str(seq): # - the machine will be not use for intermediate periods # %% tags=["hide-input"] -if not args.meta_date_col == 'PlaceholderTime': +if not args.meta_date_col == "PlaceholderTime": dates = df_meta[args.meta_date_col].sort_values() - median_sample_intensity = (df - .median(axis=1) - .to_frame('median intensity')) + median_sample_intensity = df.median(axis=1).to_frame("median intensity") median_sample_intensity = median_sample_intensity.join(dates) - ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity', - rot=90, - # fontsize=6, - figsize=(8, 2), - s=5, - xticks=pimmslearn.plotting.select_dates( - median_sample_intensity[dates.name]) - ) + ax = median_sample_intensity.plot.scatter( + x=dates.name, + y="median intensity", + rot=90, + # fontsize=6, + figsize=(8, 2), + s=5, + xticks=pimmslearn.plotting.select_dates(median_sample_intensity[dates.name]), + ) fig = ax.get_figure() - fname = args.out_figures / f'0_{group}_median_scatter' + fname = args.out_figures / f"0_{group}_median_scatter" figures[fname.stem] = fname pimmslearn.savefig(fig, fname) @@ -578,10 +617,10 @@ def join_as_str(seq): # %% tags=["hide-input"] # freq_per_feature.name = 'Gene names freq' # name it differently? # index.name is lost when data is stored -fname = args.data / 'freq_features.json' +fname = args.data / "freq_features.json" dumps[fname.name] = fname freq_per_feature.to_json(fname) -fname = fname.with_suffix('.pkl') +fname = fname.with_suffix(".pkl") dumps[fname.name] = fname freq_per_feature.to_pickle(fname) @@ -622,7 +661,9 @@ def join_as_str(seq): frac_mnar=args.frac_mnar, random_state=args.random_state, ) -logger.info(f"{splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }") +logger.info( + f"{splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }" +) # %% tags=["hide-input"] N = len(df_long) @@ -631,48 +672,42 @@ def join_as_str(seq): fig, axes = plt.subplots(1, 2, figsize=(6, 2)) ax = axes[0] -plot_histogram_intensities = partial(pimmslearn.plotting.data.plot_histogram_intensities, - min_max=min_max, - alpha=0.8) -plot_histogram_intensities( - df_long.squeeze(), - ax=ax, - label='observed') -plot_histogram_intensities( - thresholds, - ax=ax, - label='thresholds') +plot_histogram_intensities = partial( + pimmslearn.plotting.data.plot_histogram_intensities, min_max=min_max, alpha=0.8 +) +plot_histogram_intensities(df_long.squeeze(), ax=ax, label="observed") +plot_histogram_intensities(thresholds, ax=ax, label="thresholds") if args.use_every_nth_xtick > 1: ax.set_xticks(ax.get_xticks()[::2]) ax.legend() ax = axes[1] plot_histogram_intensities( - fake_na_mnar.squeeze(), - ax=ax, - label=f'MNAR ({N_MNAR:,d})', - color='C2') + fake_na_mnar.squeeze(), ax=ax, label=f"MNAR ({N_MNAR:,d})", color="C2" +) plot_histogram_intensities( - fake_na_mcar.squeeze(), - ax=ax, - color='C3', - label=f'MCAR ({N_MCAR:,d})') + fake_na_mcar.squeeze(), ax=ax, color="C3", label=f"MCAR ({N_MCAR:,d})" +) if args.use_every_nth_xtick > 1: ax.set_xticks(ax.get_xticks()[::2]) ax.legend() -fname = args.out_figures / f'0_{group}_mnar_mcar_histograms.pdf' +fname = args.out_figures / f"0_{group}_mnar_mcar_histograms.pdf" figures[fname.stem] = fname pimmslearn.savefig(fig, fname) # %% tags=["hide-input"] counts_per_bin = pimmslearn.pandas.get_counts_per_bin( df=pd.concat( - [df_long.squeeze().to_frame('observed'), - thresholds.to_frame('threshold'), - fake_na_mnar.squeeze().to_frame(f'MNAR ({N_MNAR:,d})'), - fake_na_mcar.squeeze().to_frame(f'MCAR ({N_MCAR:,d})')], - axis=1), - bins=range(min_max[0], min_max[1] + 1, 1)) -counts_per_bin.to_excel(fname.with_suffix('.xlsx')) + [ + df_long.squeeze().to_frame("observed"), + thresholds.to_frame("threshold"), + fake_na_mnar.squeeze().to_frame(f"MNAR ({N_MNAR:,d})"), + fake_na_mcar.squeeze().to_frame(f"MCAR ({N_MCAR:,d})"), + ], + axis=1, + ), + bins=range(min_max[0], min_max[1] + 1, 1), +) +counts_per_bin.to_excel(fname.with_suffix(".xlsx")) counts_per_bin @@ -688,25 +723,33 @@ def join_as_str(seq): if 0.0 < args.prop_sample_w_sim < 1.0: to_stratify = None if args.meta_cat_col and df_meta is not None: - to_stratify = df_meta[args.meta_cat_col].fillna(-1) # ! fillna with -1 as separate category (sofisticate check) - train_idx, val_test_idx = train_test_split(splits.train_X.index.levels[0], - test_size=args.prop_sample_w_sim, - stratify=to_stratify, - random_state=42) - val_idx, test_idx = train_test_split(val_test_idx, - test_size=.5, - stratify=to_stratify.loc[val_test_idx] if to_stratify is not None else None, - random_state=42) - logger.info(f"Sample in Train: {len(train_idx):,d} - Validation: {len(val_idx):,d} - Test: {len(test_idx):,d}") + to_stratify = df_meta[args.meta_cat_col].fillna( + -1 + ) # ! fillna with -1 as separate category (sofisticate check) + train_idx, val_test_idx = train_test_split( + splits.train_X.index.levels[0], + test_size=args.prop_sample_w_sim, + stratify=to_stratify, + random_state=42, + ) + val_idx, test_idx = train_test_split( + val_test_idx, + test_size=0.5, + stratify=to_stratify.loc[val_test_idx] if to_stratify is not None else None, + random_state=42, + ) + logger.info( + f"Sample in Train: {len(train_idx):,d} - Validation: {len(val_idx):,d} - Test: {len(test_idx):,d}" + ) # reassign some simulated missing values to training data: splits.train_X = pd.concat( - [splits.train_X, - splits.val_y.loc[train_idx], - splits.test_y.loc[train_idx] - ]) + [splits.train_X, splits.val_y.loc[train_idx], splits.test_y.loc[train_idx]] + ) splits.val_y = splits.val_y.loc[val_idx] splits.test_y = splits.test_y.loc[test_idx] - logger.info(f"New shapes: {splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }") + logger.info( + f"New shapes: {splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }" + ) # %% tags=["hide-input"] splits.test_y.groupby(level=-1).count().describe() @@ -738,7 +781,9 @@ def join_as_str(seq): mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4 if mask_min_4_measurments.any(): idx = mask_min_4_measurments.loc[mask_min_4_measurments].index - logger.warning(f"Features with less than 4 measurments in training data: {idx.to_list()}") + logger.warning( + f"Features with less than 4 measurments in training data: {idx.to_list()}" + ) to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]] logger.info("To remove from validation data: ") display(to_remove) @@ -748,8 +793,10 @@ def join_as_str(seq): mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4 if mask_min_4_measurments.any(): idx = mask_min_4_measurments.loc[mask_min_4_measurments].index - raise ValueError("Some features still have less than 4 measurments in training data" - f" after removing the features from the validation data: {idx.to_list()}") + raise ValueError( + "Some features still have less than 4 measurments in training data" + f" after removing the features from the validation data: {idx.to_list()}" + ) # %% [markdown] # ### Save in long format @@ -774,11 +821,11 @@ def join_as_str(seq): # %% tags=["hide-input"] splits_df = pd.DataFrame(index=df_long.index) -splits_df['train'] = splits.train_X -splits_df['val'] = splits.val_y -splits_df['test'] = splits.test_y +splits_df["train"] = splits.train_X +splits_df["val"] = splits.val_y +splits_df["test"] = splits.test_y stats_splits = splits_df.describe() -stats_splits.to_excel(writer, 'stats_splits', float_format='%.3f') +stats_splits.to_excel(writer, "stats_splits", float_format="%.3f") stats_splits # %% tags=["hide-input"] @@ -790,72 +837,64 @@ def join_as_str(seq): # for s in ('train', 'val', 'test')] _legend = [ f'{s:<5} (N={int(stats_splits.loc["count", s]):,d})' - for s in ('train', 'val', 'test')] + for s in ("train", "val", "test") +] print(_legend) # %% tags=["hide-input"] group = 3 -ax = (splits - .train_X - .plot - .hist( - bins=bins, - ax=None, - color='C0', - )) -_ = (splits - .val_y - .plot - .hist(bins=bins, - xticks=list(bins), - ax=ax, - color='C2', - legend=True) - ) +ax = splits.train_X.plot.hist( + bins=bins, + ax=None, + color="C0", +) +_ = splits.val_y.plot.hist(bins=bins, xticks=list(bins), ax=ax, color="C2", legend=True) ax.legend(_legend[:-1]) if args.use_every_nth_xtick > 1: ax.set_xticks(ax.get_xticks()[::2]) -ax.set_xlabel('Intensity bins') -fname = args.out_figures / f'0_{group}_val_over_train_split.pdf' +ax.set_xlabel("Intensity bins") +fname = args.out_figures / f"0_{group}_val_over_train_split.pdf" figures[fname.name] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] min_bin, max_bin = pimmslearn.plotting.data.min_max(splits.val_y) bins = range(int(min_bin), int(max_bin) + 1, 1) -ax = splits_df.plot.hist(bins=bins, - xticks=list(bins), - legend=False, - stacked=True, - color=['C0', 'C1', 'C2'], - ) +ax = splits_df.plot.hist( + bins=bins, + xticks=list(bins), + legend=False, + stacked=True, + color=["C0", "C1", "C2"], +) if args.use_every_nth_xtick > 1: ax.set_xticks(ax.get_xticks()[::2]) ax.legend(_legend) -ax.set_xlabel('Intensity bins') +ax.set_xlabel("Intensity bins") ax.yaxis.set_major_formatter("{x:,.0f}") -fname = args.out_figures / f'0_{group}_splits_freq_stacked.pdf' +fname = args.out_figures / f"0_{group}_splits_freq_stacked.pdf" figures[fname.name] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] counts_per_bin = pimmslearn.pandas.get_counts_per_bin(df=splits_df, bins=bins) -counts_per_bin.to_excel(fname.with_suffix('.xlsx')) +counts_per_bin.to_excel(fname.with_suffix(".xlsx")) counts_per_bin # %% tags=["hide-input"] -ax = splits_df.drop('train', axis=1).plot.hist(bins=bins, - xticks=list(bins), - color=['C1', 'C2'], - legend=False, - stacked=True, - ) +ax = splits_df.drop("train", axis=1).plot.hist( + bins=bins, + xticks=list(bins), + color=["C1", "C2"], + legend=False, + stacked=True, +) if args.use_every_nth_xtick > 1: ax.set_xticks(ax.get_xticks()[::2]) ax.legend(_legend[1:]) -ax.set_xlabel('Intensity bins') +ax.set_xlabel("Intensity bins") ax.yaxis.set_major_formatter("{x:,.0f}") -fname = args.out_figures / f'0_{group}_val_test_split_freq_stacked_.pdf' +fname = args.out_figures / f"0_{group}_val_test_split_freq_stacked_.pdf" figures[fname.name] = fname pimmslearn.savefig(ax.get_figure(), fname) @@ -868,46 +907,50 @@ def join_as_str(seq): # %% tags=["hide-input"] ax = pimmslearn.plotting.data.plot_feat_median_over_prop_missing( - data=splits.train_X, type='scatter') -fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter_train' + data=splits.train_X, type="scatter" +) +fname = args.out_figures / f"0_{group}_intensity_median_vs_prop_missing_scatter_train" figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] ax = pimmslearn.plotting.data.plot_feat_median_over_prop_missing( - data=splits.train_X, type='boxplot') -fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_train' + data=splits.train_X, type="boxplot" +) +fname = args.out_figures / f"0_{group}_intensity_median_vs_prop_missing_boxplot_train" figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] -medians = (splits - .train_X - .median() - .astype(int) - ).to_frame('median_floor') +medians = (splits.train_X.median().astype(int)).to_frame("median_floor") -feat_with_median = medians.groupby('median_floor').size().rename('M feat') -medians = medians.join(feat_with_median, on='median_floor') +feat_with_median = medians.groupby("median_floor").size().rename("M feat") +medians = medians.join(feat_with_median, on="median_floor") medians = medians.apply(lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1) fig, ax = plt.subplots(figsize=(6, 2)) s = 1 -s_axes = pd.DataFrame({'medians': medians, - 'Validation split': splits.val_y.notna().sum(), - 'Training split': splits.train_X.notna().sum()} - ).plot.box(by='medians', - boxprops=dict(linewidth=s), - flierprops=dict(markersize=s), - ax=ax) +s_axes = pd.DataFrame( + { + "medians": medians, + "Validation split": splits.val_y.notna().sum(), + "Training split": splits.train_X.notna().sum(), + } +).plot.box( + by="medians", boxprops=dict(linewidth=s), flierprops=dict(markersize=s), ax=ax +) for ax in s_axes: - _ = ax.set_xticklabels(ax.get_xticklabels(), - rotation=45, - horizontalalignment='right') - ax.set_xlabel(f'{args.feat_name_display.capitalize()} binned by their median intensity ' - f'(N {args.feat_name_display})') - _ = ax.set_ylabel('Frequency') -fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train' + _ = ax.set_xticklabels( + ax.get_xticklabels(), rotation=45, horizontalalignment="right" + ) + ax.set_xlabel( + f"{args.feat_name_display.capitalize()} binned by their median intensity " + f"(N {args.feat_name_display})" + ) + _ = ax.set_ylabel("Frequency") +fname = ( + args.out_figures / f"0_{group}_intensity_median_vs_prop_missing_boxplot_val_train" +) figures[fname.stem] = fname pimmslearn.savefig(ax.get_figure(), fname) @@ -915,7 +958,7 @@ def join_as_str(seq): # ## Save parameters # %% tags=["hide-input"] -fname = args.folder_experiment / 'data_config.yaml' +fname = args.folder_experiment / "data_config.yaml" args.dump(fname) args diff --git a/tests/io/test_data_objects.py b/tests/io/test_data_objects.py index d4da76db8..cf73306b6 100644 --- a/tests/io/test_data_objects.py +++ b/tests/io/test_data_objects.py @@ -32,13 +32,13 @@ def test_select_max_by(): index_columns = ["Sequence", "Charge"] - selection_column = 'Score' + selection_column = "Score" df = pd.read_csv(io.StringIO(data)) - actual = select_max_by(df, - grouping_columns=index_columns, - selection_column=selection_column).set_index(index_columns) + actual = select_max_by( + df, grouping_columns=index_columns, selection_column=selection_column + ).set_index(index_columns) desired = pd.read_csv(io.StringIO(expected), index_col=index_columns) assert desired.equals(actual) diff --git a/tests/io/test_dataloaders.py b/tests/io/test_dataloaders.py index 195fe5f39..ef201e093 100644 --- a/tests/io/test_dataloaders.py +++ b/tests/io/test_dataloaders.py @@ -11,15 +11,17 @@ def test_get_dls(): N, M = 23, 11 X_train = create_random_df(N, M) N_valid = int(N * 0.3) - X_valid = create_random_df( - N_valid, M, prop_na=.1, start_idx=len(X_train)) + X_valid = create_random_df(N_valid, M, prop_na=0.1, start_idx=len(X_train)) dae_default_pipeline = sklearn.pipeline.Pipeline( - [('normalize', StandardScaler()), - ('impute', SimpleImputer(add_indicator=False))]) - transforms = VaepPipeline(df_train=X_train, - encode=dae_default_pipeline, - decode=['normalize']) + [ + ("normalize", StandardScaler()), + ("impute", SimpleImputer(add_indicator=False)), + ] + ) + transforms = VaepPipeline( + df_train=X_train, encode=dae_default_pipeline, decode=["normalize"] + ) BS = 4 dls = get_dls(train_X=X_train, valid_X=X_valid, transformer=transforms, bs=BS) assert len(dls.train_ds) == N diff --git a/tests/io/test_dataset.py b/tests/io/test_dataset.py index d80f5f577..895671d5e 100644 --- a/tests/io/test_dataset.py +++ b/tests/io/test_dataset.py @@ -19,10 +19,11 @@ def create_DataFrame(): data = np.arange(100).reshape(-1, 5) - data = pd.DataFrame(data, - index=(f'row_{i:02}' for i in range(data.shape[0])), - columns=(f'feat_{i:02}' for i in range(data.shape[1])) - ) + data = pd.DataFrame( + data, + index=(f"row_{i:02}" for i in range(data.shape[0])), + columns=(f"feat_{i:02}" for i in range(data.shape[1])), + ) return data @@ -306,16 +307,20 @@ def test_DatasetWithTargetSpecifyTarget_floats(): ds = datasets.DatasetWithTargetSpecifyTarget(df=data, targets=targets) torch.testing.assert_close( - ds[-1][0], torch.tensor([0., 1., 1., 1., - 1., 1., 1.], dtype=torch.float32) + ds[-1][0], + torch.tensor([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=torch.float32), ) torch.testing.assert_close( ds[-1][1], - torch.tensor([nan, 24.5716, nan, nan, - nan, 23.6247, 22.4436], dtype=torch.float32), - equal_nan=True) + torch.tensor( + [nan, 24.5716, nan, nan, nan, 23.6247, 22.4436], dtype=torch.float32 + ), + equal_nan=True, + ) torch.testing.assert_close( ds[-1][2], - torch.tensor([23.2892, 24.5716, nan, - nan, nan, 23.6247, 22.4436], dtype=torch.float32), - equal_nan=True) + torch.tensor( + [23.2892, 24.5716, nan, nan, nan, 23.6247, 22.4436], dtype=torch.float32 + ), + equal_nan=True, + ) diff --git a/tests/io/test_datasplits.py b/tests/io/test_datasplits.py index afccec99c..6d50e4d8b 100644 --- a/tests/io/test_datasplits.py +++ b/tests/io/test_datasplits.py @@ -7,15 +7,21 @@ N, M = 10, 4 X = np.random.rand(N, M) -df = (pd.DataFrame(X, - index=[f'sample_{i}' for i in range(N)], - columns=(f'feat_{i}' for i in range(M))) - .rename_axis('Sample ID') - .rename_axis('Feature Name', axis=1)) - -_splits = {'train_X': df.iloc[:int(N * 0.6)], - 'val_y': df.iloc[int(N * 0.6):int(N * 0.8)], - 'test_y': df.iloc[int(N * 0.8):]} +df = ( + pd.DataFrame( + X, + index=[f"sample_{i}" for i in range(N)], + columns=(f"feat_{i}" for i in range(M)), + ) + .rename_axis("Sample ID") + .rename_axis("Feature Name", axis=1) +) + +_splits = { + "train_X": df.iloc[: int(N * 0.6)], + "val_y": df.iloc[int(N * 0.6) : int(N * 0.8)], + "test_y": df.iloc[int(N * 0.8) :], +} def test_DataSplits_iter(): @@ -28,15 +34,26 @@ def test_DataSplits_iter(): def test_DataSplits_dir(): actual = sorted(dir(DataSplits(is_wide_format=False))) # expected = sorted(list(_splits)) - expected = ['dump', 'from_folder', 'interpolate', 'load', 'test_X', 'test_y', - 'to_long_format', 'to_wide_format', 'train_X', 'val_X', 'val_y'] + expected = [ + "dump", + "from_folder", + "interpolate", + "load", + "test_X", + "test_y", + "to_long_format", + "to_wide_format", + "train_X", + "val_X", + "val_y", + ] assert actual == expected def test_load_missing_dir(): splits = DataSplits(is_wide_format=False) with pytest.raises(AssertionError): - splits.load(folder='non_exisiting') + splits.load(folder="non_exisiting") def test_dump_empty(tmp_path): @@ -53,9 +70,9 @@ def test_dump_load(tmp_path): splits = DataSplits(is_wide_format=None) splits.load(folder=tmp_path, use_wide_format=True) - assert splits.train_X is not _splits['train_X'] + assert splits.train_X is not _splits["train_X"] - npt.assert_almost_equal(_splits['train_X'].values, splits.train_X) + npt.assert_almost_equal(_splits["train_X"].values, splits.train_X) # #ToDo: Index and Column names are not yet correctly set # assert splits.train_X.equals(_splits['train_X']) @@ -92,11 +109,11 @@ def test_interpolate(): splits = DataSplits(**_splits, is_wide_format=True) splits._is_wide = True # ToDo. Is not correctly set when init is called. with pytest.raises(AttributeError): - _ = splits.interpolate('non-existing') + _ = splits.interpolate("non-existing") - _ = splits.interpolate('train_X') + _ = splits.interpolate("train_X") with pytest.raises(AttributeError): - _ = splits.interpolate('val_X') + _ = splits.interpolate("val_X") with pytest.raises(TypeError): _ = splits.interpolate(4) diff --git a/tests/pandas/test_calc_errors.py b/tests/pandas/test_calc_errors.py index 8af9ffd6f..40d2791c1 100644 --- a/tests/pandas/test_calc_errors.py +++ b/tests/pandas/test_calc_errors.py @@ -4,200 +4,250 @@ from pimmslearn.pandas import calc_errors -@fixture(scope='module') +@fixture(scope="module") def example_data(): """Example data with duplicated index values. Normally MulitIndex is used with unique combination of sample and feat values.""" - data = [[25.47317633, 27.23206642, 26.43510602, 28.40661375, 27.6536975], - [30.57866718, 30.17035425, 30.22881888, 29.82725333, 30.1177242], - [31.75756084, 31.72259066, 31.7095658, 31.78082111, 31.64350377], - [28.75620915, 28.43822661, 28.81199378, 28.56780444, 29.47071631], - [28.56543566, 29.36447242, 29.36242078, 29.26300889, 29.54972708], - [26.42588817, 25.34738488, 25.8179272, 25.12474125, 27.63014646], - [29.42071407, 29.94044252, 29.90296812, 30.049005, 30.24881789], - [29.95254851, 29.69691913, 29.71156873, 29.90902556, 30.0209686], - [26.47991271, 26.12002046, 26.16896366, 25.73061, 25.93545821], - [28.93845563, 29.29660334, 29.04327331, 28.912886, 28.79694124]] - data = pd.DataFrame(data, - index=(f'feat_{i}' for i in [ - 0, 0, 1, 1, 1, 2, 3, 4, 5, 6]), - columns=['observed'] + ['model_' + str(i + 1) for i in range(4)]) - data.columns.name = 'model' - data.index.name = 'feat' - data['freq_feat'] = [4, 4, 5, 5, 5, 6, 7, 9, 8, 6] + data = [ + [25.47317633, 27.23206642, 26.43510602, 28.40661375, 27.6536975], + [30.57866718, 30.17035425, 30.22881888, 29.82725333, 30.1177242], + [31.75756084, 31.72259066, 31.7095658, 31.78082111, 31.64350377], + [28.75620915, 28.43822661, 28.81199378, 28.56780444, 29.47071631], + [28.56543566, 29.36447242, 29.36242078, 29.26300889, 29.54972708], + [26.42588817, 25.34738488, 25.8179272, 25.12474125, 27.63014646], + [29.42071407, 29.94044252, 29.90296812, 30.049005, 30.24881789], + [29.95254851, 29.69691913, 29.71156873, 29.90902556, 30.0209686], + [26.47991271, 26.12002046, 26.16896366, 25.73061, 25.93545821], + [28.93845563, 29.29660334, 29.04327331, 28.912886, 28.79694124], + ] + data = pd.DataFrame( + data, + index=(f"feat_{i}" for i in [0, 0, 1, 1, 1, 2, 3, 4, 5, 6]), + columns=["observed"] + ["model_" + str(i + 1) for i in range(4)], + ) + data.columns.name = "model" + data.index.name = "feat" + data["freq_feat"] = [4, 4, 5, 5, 5, 6, 7, 9, 8, 6] return data def test_get_absolute_error(example_data): - expected = {'feat': {0: 'feat_0', - 1: 'feat_0', - 2: 'feat_1', - 3: 'feat_1', - 4: 'feat_1', - 5: 'feat_2', - 6: 'feat_3', - 7: 'feat_4', - 8: 'feat_5', - 9: 'feat_6'}, - 'model_1': {0: 1.7588900899999977, - 1: 0.408312930000001, - 2: 0.03497017999999841, - 3: 0.31798253999999915, - 4: 0.7990367599999999, - 5: 1.0785032900000004, - 6: 0.5197284500000023, - 7: 0.25562937999999846, - 8: 0.35989225000000147, - 9: 0.3581477100000008}, - 'model_2': {0: 0.9619296899999981, - 1: 0.34984830000000144, - 2: 0.04799503999999999, - 3: 0.055784630000001556, - 4: 0.7969851200000022, - 5: 0.6079609700000006, - 6: 0.48225405000000166, - 7: 0.24097977999999998, - 8: 0.3109490500000014, - 9: 0.10481768000000002}, - 'model_3': {0: 2.9334374200000006, - 1: 0.7514138499999987, - 2: 0.023260270000001526, - 3: 0.18840471000000036, - 4: 0.6975732299999997, - 5: 1.3011469200000008, - 6: 0.6282909300000021, - 7: 0.04352294999999984, - 8: 0.749302710000002, - 9: 0.025569629999999677}, - 'model_4': {0: 2.1805211699999987, - 1: 0.46094297999999867, - 2: 0.1140570700000012, - 3: 0.7145071600000001, - 4: 0.9842914200000017, - 5: 1.2042582899999985, - 6: 0.8281038200000026, - 7: 0.06842009000000004, - 8: 0.5444545000000005, - 9: 0.1415143900000011}} + expected = { + "feat": { + 0: "feat_0", + 1: "feat_0", + 2: "feat_1", + 3: "feat_1", + 4: "feat_1", + 5: "feat_2", + 6: "feat_3", + 7: "feat_4", + 8: "feat_5", + 9: "feat_6", + }, + "model_1": { + 0: 1.7588900899999977, + 1: 0.408312930000001, + 2: 0.03497017999999841, + 3: 0.31798253999999915, + 4: 0.7990367599999999, + 5: 1.0785032900000004, + 6: 0.5197284500000023, + 7: 0.25562937999999846, + 8: 0.35989225000000147, + 9: 0.3581477100000008, + }, + "model_2": { + 0: 0.9619296899999981, + 1: 0.34984830000000144, + 2: 0.04799503999999999, + 3: 0.055784630000001556, + 4: 0.7969851200000022, + 5: 0.6079609700000006, + 6: 0.48225405000000166, + 7: 0.24097977999999998, + 8: 0.3109490500000014, + 9: 0.10481768000000002, + }, + "model_3": { + 0: 2.9334374200000006, + 1: 0.7514138499999987, + 2: 0.023260270000001526, + 3: 0.18840471000000036, + 4: 0.6975732299999997, + 5: 1.3011469200000008, + 6: 0.6282909300000021, + 7: 0.04352294999999984, + 8: 0.749302710000002, + 9: 0.025569629999999677, + }, + "model_4": { + 0: 2.1805211699999987, + 1: 0.46094297999999867, + 2: 0.1140570700000012, + 3: 0.7145071600000001, + 4: 0.9842914200000017, + 5: 1.2042582899999985, + 6: 0.8281038200000026, + 7: 0.06842009000000004, + 8: 0.5444545000000005, + 9: 0.1415143900000011, + }, + } - actual = calc_errors.get_absolute_error( - example_data.drop('freq_feat', axis=1)).reset_index().to_dict() + actual = ( + calc_errors.get_absolute_error(example_data.drop("freq_feat", axis=1)) + .reset_index() + .to_dict() + ) assert actual == expected def test_calc_errors_per_feat(example_data): - expected = {'feat': {0: 'feat_0', - 1: 'feat_0', - 2: 'feat_1', - 3: 'feat_1', - 4: 'feat_1', - 5: 'feat_2', - 6: 'feat_6', - 7: 'feat_3', - 8: 'feat_5', - 9: 'feat_4'}, - 'model_1': {0: 1.0836015099999994, - 1: 1.0836015099999994, - 2: 0.38399649333333247, - 3: 0.38399649333333247, - 4: 0.38399649333333247, - 5: 1.0785032900000004, - 6: 0.3581477100000008, - 7: 0.5197284500000023, - 8: 0.35989225000000147, - 9: 0.25562937999999846}, - 'model_2': {0: 0.6558889949999998, - 1: 0.6558889949999998, - 2: 0.30025493000000125, - 3: 0.30025493000000125, - 4: 0.30025493000000125, - 5: 0.6079609700000006, - 6: 0.10481768000000002, - 7: 0.48225405000000166, - 8: 0.3109490500000014, - 9: 0.24097977999999998}, - 'model_3': {0: 1.8424256349999997, - 1: 1.8424256349999997, - 2: 0.3030794033333339, - 3: 0.3030794033333339, - 4: 0.3030794033333339, - 5: 1.3011469200000008, - 6: 0.025569629999999677, - 7: 0.6282909300000021, - 8: 0.749302710000002, - 9: 0.04352294999999984}, - 'model_4': {0: 1.3207320749999987, - 1: 1.3207320749999987, - 2: 0.6042852166666677, - 3: 0.6042852166666677, - 4: 0.6042852166666677, - 5: 1.2042582899999985, - 6: 0.1415143900000011, - 7: 0.8281038200000026, - 8: 0.5444545000000005, - 9: 0.06842009000000004}, - 'freq_feat': {0: 4, 1: 4, 2: 5, 3: 5, 4: 5, 5: 6, 6: 6, 7: 7, 8: 8, 9: 9}, - 'n_obs': {0: 2, 1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}} - actual = calc_errors.calc_errors_per_feat( - pred=example_data.drop('freq_feat', axis=1), - freq_feat=example_data['freq_feat']).reset_index().to_dict() + expected = { + "feat": { + 0: "feat_0", + 1: "feat_0", + 2: "feat_1", + 3: "feat_1", + 4: "feat_1", + 5: "feat_2", + 6: "feat_6", + 7: "feat_3", + 8: "feat_5", + 9: "feat_4", + }, + "model_1": { + 0: 1.0836015099999994, + 1: 1.0836015099999994, + 2: 0.38399649333333247, + 3: 0.38399649333333247, + 4: 0.38399649333333247, + 5: 1.0785032900000004, + 6: 0.3581477100000008, + 7: 0.5197284500000023, + 8: 0.35989225000000147, + 9: 0.25562937999999846, + }, + "model_2": { + 0: 0.6558889949999998, + 1: 0.6558889949999998, + 2: 0.30025493000000125, + 3: 0.30025493000000125, + 4: 0.30025493000000125, + 5: 0.6079609700000006, + 6: 0.10481768000000002, + 7: 0.48225405000000166, + 8: 0.3109490500000014, + 9: 0.24097977999999998, + }, + "model_3": { + 0: 1.8424256349999997, + 1: 1.8424256349999997, + 2: 0.3030794033333339, + 3: 0.3030794033333339, + 4: 0.3030794033333339, + 5: 1.3011469200000008, + 6: 0.025569629999999677, + 7: 0.6282909300000021, + 8: 0.749302710000002, + 9: 0.04352294999999984, + }, + "model_4": { + 0: 1.3207320749999987, + 1: 1.3207320749999987, + 2: 0.6042852166666677, + 3: 0.6042852166666677, + 4: 0.6042852166666677, + 5: 1.2042582899999985, + 6: 0.1415143900000011, + 7: 0.8281038200000026, + 8: 0.5444545000000005, + 9: 0.06842009000000004, + }, + "freq_feat": {0: 4, 1: 4, 2: 5, 3: 5, 4: 5, 5: 6, 6: 6, 7: 7, 8: 8, 9: 9}, + "n_obs": {0: 2, 1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}, + } + actual = ( + calc_errors.calc_errors_per_feat( + pred=example_data.drop("freq_feat", axis=1), + freq_feat=example_data["freq_feat"], + ) + .reset_index() + .to_dict() + ) assert actual == expected def test_calc_errors_per_bin(example_data): - expected = {'feat': {0: 'feat_0', - 1: 'feat_0', - 2: 'feat_1', - 3: 'feat_5', - 4: 'feat_2', - 5: 'feat_4', - 6: 'feat_3', - 7: 'feat_1', - 8: 'feat_6', - 9: 'feat_1'}, - 'model_1': {0: 1.7588900899999977, - 1: 0.408312930000001, - 2: 0.03497017999999841, - 3: 0.35989225000000147, - 4: 1.0785032900000004, - 5: 0.25562937999999846, - 6: 0.5197284500000023, - 7: 0.31798253999999915, - 8: 0.3581477100000008, - 9: 0.7990367599999999}, - 'model_2': {0: 0.9619296899999981, - 1: 0.34984830000000144, - 2: 0.04799503999999999, - 3: 0.3109490500000014, - 4: 0.6079609700000006, - 5: 0.24097977999999998, - 6: 0.48225405000000166, - 7: 0.055784630000001556, - 8: 0.10481768000000002, - 9: 0.7969851200000022}, - 'model_3': {0: 2.9334374200000006, - 1: 0.7514138499999987, - 2: 0.023260270000001526, - 3: 0.749302710000002, - 4: 1.3011469200000008, - 5: 0.04352294999999984, - 6: 0.6282909300000021, - 7: 0.18840471000000036, - 8: 0.025569629999999677, - 9: 0.6975732299999997}, - 'model_4': {0: 2.1805211699999987, - 1: 0.46094297999999867, - 2: 0.1140570700000012, - 3: 0.5444545000000005, - 4: 1.2042582899999985, - 5: 0.06842009000000004, - 6: 0.8281038200000026, - 7: 0.7145071600000001, - 8: 0.1415143900000011, - 9: 0.9842914200000017}, - 'bin': {0: 25, 1: 30, 2: 31, 3: 26, 4: 26, 5: 29, 6: 29, 7: 28, 8: 28, 9: 28}, - 'n_obs': {0: 1, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3}} - actual = calc_errors.calc_errors_per_bin( - example_data.drop('freq_feat', axis=1) - ).sort_values(["n_obs", "bin", "model_1"]).reset_index().to_dict() + expected = { + "feat": { + 0: "feat_0", + 1: "feat_0", + 2: "feat_1", + 3: "feat_5", + 4: "feat_2", + 5: "feat_4", + 6: "feat_3", + 7: "feat_1", + 8: "feat_6", + 9: "feat_1", + }, + "model_1": { + 0: 1.7588900899999977, + 1: 0.408312930000001, + 2: 0.03497017999999841, + 3: 0.35989225000000147, + 4: 1.0785032900000004, + 5: 0.25562937999999846, + 6: 0.5197284500000023, + 7: 0.31798253999999915, + 8: 0.3581477100000008, + 9: 0.7990367599999999, + }, + "model_2": { + 0: 0.9619296899999981, + 1: 0.34984830000000144, + 2: 0.04799503999999999, + 3: 0.3109490500000014, + 4: 0.6079609700000006, + 5: 0.24097977999999998, + 6: 0.48225405000000166, + 7: 0.055784630000001556, + 8: 0.10481768000000002, + 9: 0.7969851200000022, + }, + "model_3": { + 0: 2.9334374200000006, + 1: 0.7514138499999987, + 2: 0.023260270000001526, + 3: 0.749302710000002, + 4: 1.3011469200000008, + 5: 0.04352294999999984, + 6: 0.6282909300000021, + 7: 0.18840471000000036, + 8: 0.025569629999999677, + 9: 0.6975732299999997, + }, + "model_4": { + 0: 2.1805211699999987, + 1: 0.46094297999999867, + 2: 0.1140570700000012, + 3: 0.5444545000000005, + 4: 1.2042582899999985, + 5: 0.06842009000000004, + 6: 0.8281038200000026, + 7: 0.7145071600000001, + 8: 0.1415143900000011, + 9: 0.9842914200000017, + }, + "bin": {0: 25, 1: 30, 2: 31, 3: 26, 4: 26, 5: 29, 6: 29, 7: 28, 8: 28, 9: 28}, + "n_obs": {0: 1, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3}, + } + actual = ( + calc_errors.calc_errors_per_bin(example_data.drop("freq_feat", axis=1)) + .sort_values(["n_obs", "bin", "model_1"]) + .reset_index() + .to_dict() + ) assert actual == expected diff --git a/tests/plotting/test_defaults.py b/tests/plotting/test_defaults.py index 2fbec8075..4f2b43aa3 100644 --- a/tests/plotting/test_defaults.py +++ b/tests/plotting/test_defaults.py @@ -2,8 +2,10 @@ def test_assign_colors(): - expected = [(0.8392156862745098, 0.15294117647058825, 0.1568627450980392), - (0.17254901960784313, 0.6274509803921569, 0.17254901960784313), - (0.21044753832183283, 0.6773105080456748, 0.6433941168468681)] - assigned = assign_colors(['DAE', 'CF', 'Test']) + expected = [ + (0.8392156862745098, 0.15294117647058825, 0.1568627450980392), + (0.17254901960784313, 0.6274509803921569, 0.17254901960784313), + (0.21044753832183283, 0.6773105080456748, 0.6433941168468681), + ] + assigned = assign_colors(["DAE", "CF", "Test"]) assert assigned == expected diff --git a/tests/plotting/test_errors.py b/tests/plotting/test_errors.py index b2f8f8c1f..c2aa9ea7b 100644 --- a/tests/plotting/test_errors.py +++ b/tests/plotting/test_errors.py @@ -4,60 +4,64 @@ import pandas as pd import pytest from matplotlib.testing.decorators import image_comparison -from pimmslearn.plotting.errors import (get_data_for_errors_by_median, - plot_errors_by_median) - -TOP_N_COLOR_PALETTE = {'TRKNN': (0.20125317221201128, 0.6907920815379025, 0.47966761189275336), - 'KNN': (0.12156862745098039, 0.4666666666666667, 0.7058823529411765), - 'RF': (0.5490196078431373, 0.33725490196078434, 0.29411764705882354), - 'KNN_IMPUTE': (1.0, 0.4980392156862745, 0.054901960784313725), - 'SEQKNN': (0.7632105624545802, 0.5838460616396939, 0.19465686802007026)} +from pimmslearn.plotting.errors import ( + get_data_for_errors_by_median, + plot_errors_by_median, +) + +TOP_N_COLOR_PALETTE = { + "TRKNN": (0.20125317221201128, 0.6907920815379025, 0.47966761189275336), + "KNN": (0.12156862745098039, 0.4666666666666667, 0.7058823529411765), + "RF": (0.5490196078431373, 0.33725490196078434, 0.29411764705882354), + "KNN_IMPUTE": (1.0, 0.4980392156862745, 0.054901960784313725), + "SEQKNN": (0.7632105624545802, 0.5838460616396939, 0.19465686802007026), +} file_dir = Path(__file__).resolve().parent -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def example_pred_loaded(): """ Fixture to load example data from a csv file for testing. """ - example_data_path = file_dir / 'pred_testing_example.csv' + example_data_path = file_dir / "pred_testing_example.csv" return pd.read_csv(example_data_path, index_col=[0, 1]) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def feat_medians(): - medians_path = file_dir / 'test_medians.csv' + medians_path = file_dir / "test_medians.csv" s = pd.read_csv(medians_path, index_col=0).squeeze() return s -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def expected_errors_binned(): - errors_binned_path = file_dir / 'exp_errors_binned.csv' - df = pd.read_csv(errors_binned_path, sep=',', index_col=0) - col_cat = 'intensity binned by median of Gene Names' + errors_binned_path = file_dir / "exp_errors_binned.csv" + df = pd.read_csv(errors_binned_path, sep=",", index_col=0) + col_cat = "intensity binned by median of Gene Names" # ! Windows reads in new line in string characters as '\r\n' - df[col_cat] = df[col_cat].str.replace('\r\n', '\n') - df = df.astype({col_cat: 'category'}) + df[col_cat] = df[col_cat].str.replace("\r\n", "\n") + df = df.astype({col_cat: "category"}) return df -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def expected_plotted(): - plotted_path = file_dir / 'expected_plotted.csv' + plotted_path = file_dir / "expected_plotted.csv" # ! Windows reads in new line in string characters as '\r\n' - df = pd.read_csv(plotted_path, sep=',', index_col=0) - df["bin"] = df["bin"].str.replace('\r\n', '\n').astype('category') - df = df.sort_values(by=['bin', 'model']).reset_index(drop=True) + df = pd.read_csv(plotted_path, sep=",", index_col=0) + df["bin"] = df["bin"].str.replace("\r\n", "\n").astype("category") + df = df.sort_values(by=["bin", "model"]).reset_index(drop=True) return df def test_get_data_for_errors_by_median(expected_plotted, expected_errors_binned): plotted = get_data_for_errors_by_median( errors=expected_errors_binned, - feat_name='Gene Names', - metric_name='MAE', + feat_name="Gene Names", + metric_name="MAE", seed=42, ) @@ -66,18 +70,22 @@ def test_get_data_for_errors_by_median(expected_plotted, expected_errors_binned) # @image_comparison(baseline_images=['errors_by_median'], remove_text=True, # extensions=['png'], style='mpl20') -def test_plot_errors_by_median(example_pred_loaded, feat_medians, expected_errors_binned): +def test_plot_errors_by_median( + example_pred_loaded, feat_medians, expected_errors_binned +): fig, ax = plt.subplots(figsize=(8, 3)) ax, errors_binned = plot_errors_by_median( example_pred_loaded, feat_medians=feat_medians, ax=ax, - feat_name='Gene Names', + feat_name="Gene Names", palette=TOP_N_COLOR_PALETTE, - metric_name='MAE',) + metric_name="MAE", + ) ax.set_ylabel("Average error (MAE)") - ax.legend(loc='best', ncols=5) + ax.legend(loc="best", ncols=5) fig.tight_layout() - pd.testing.assert_frame_equal(errors_binned, expected_errors_binned, - check_dtype=False) + pd.testing.assert_frame_equal( + errors_binned, expected_errors_binned, check_dtype=False + ) diff --git a/tests/test_ae.py b/tests/test_ae.py index 6b3988ebc..a79674cc3 100644 --- a/tests/test_ae.py +++ b/tests/test_ae.py @@ -1,7 +1,6 @@ import pimmslearn from pimmslearn.models import ae - expected_repr = """Autoencoder( (encoder): Sequential( (0): Linear(in_features=100, out_features=30, bias=True) diff --git a/tests/test_collab.py b/tests/test_collab.py index 42a728715..01f501e74 100644 --- a/tests/test_collab.py +++ b/tests/test_collab.py @@ -9,13 +9,15 @@ N, M = 10, 4 X = np.random.rand(N, M) -df = pd.DataFrame(X, - index=[f'sample_{i}' for i in range(N)], - columns=(f'feat_{i}' for i in range(M))) +df = pd.DataFrame( + X, index=[f"sample_{i}" for i in range(N)], columns=(f"feat_{i}" for i in range(M)) +) -data = {'train_X': df.iloc[:int(N * 0.6)], - 'val_y': df.iloc[int(N * 0.6):int(N * 0.8)], - 'test_y': df.iloc[int(N * 0.8):]} +data = { + "train_X": df.iloc[: int(N * 0.6)], + "val_y": df.iloc[int(N * 0.6) : int(N * 0.8)], + "test_y": df.iloc[int(N * 0.8) :], +} data = DataSplits(**data, is_wide_format=True) assert data._is_wide diff --git a/tests/test_imports.py b/tests/test_imports.py index 3ce8e383e..f69850ff6 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -1,6 +1,6 @@ - def test_imports(): import pimmslearn.analyzers import pimmslearn.sklearn + print(pimmslearn.analyzers.__doc__) print(pimmslearn.sklearn.__doc__) diff --git a/tests/test_imputation.py b/tests/test_imputation.py index 61cd3068f..e433e2774 100644 --- a/tests/test_imputation.py +++ b/tests/test_imputation.py @@ -13,6 +13,7 @@ data.apply(numpy.random.shuffle, axis=1) data.to_csv('test_data.csv') """ + from pathlib import Path import numpy as np @@ -27,16 +28,17 @@ def example_data(): """ Fixture to load example data from a csv file for testing. """ - example_data_path = Path(__file__).resolve().parent / 'test_data.csv' - return pd.read_csv(example_data_path, index_col='id') + example_data_path = Path(__file__).resolve().parent / "test_data.csv" + return pd.read_csv(example_data_path, index_col="id") -@pytest.mark.parametrize('axis', [0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) def test_impute_shifted_normal(example_data, axis): mean_shift = 1.8 # remove zeros as these lead to -inf - example_data = np.log2(example_data.replace({0.0: np.nan}) - ).dropna(thresh=10, axis=1 - axis) + example_data = np.log2(example_data.replace({0.0: np.nan})).dropna( + thresh=10, axis=1 - axis + ) N, M = example_data.shape mask_observed = example_data.notna() imputed = impute_shifted_normal(example_data, axis=axis, mean_shift=mean_shift) diff --git a/tests/test_io.py b/tests/test_io.py index 05715c103..df314a71f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -4,9 +4,9 @@ def test_relative_to(): - fpath = Path('project/runs/experiment_name/run') - pwd = 'project/runs/' # per defaut '.' (the current working directory) - expected = Path('experiment_name/run') + fpath = Path("project/runs/experiment_name/run") + pwd = "project/runs/" # per defaut '.' (the current working directory) + expected = Path("experiment_name/run") acutal = pimmslearn.io.resolve_path(fpath, pwd) assert expected == acutal diff --git a/tests/test_models/test_collect_dumps.py b/tests/test_models/test_collect_dumps.py index d08b0c4b2..8ac3f7082 100644 --- a/tests/test_models/test_collect_dumps.py +++ b/tests/test_models/test_collect_dumps.py @@ -2,11 +2,11 @@ def test_select_content(): - test_cases = ['model_metrics_HL_1024_512_256_dae', - 'model_metrics_HL_1024_512_vae', - 'model_metrics_collab'] - expected = ['HL_1024_512_256', - 'HL_1024_512', - 'collab'] + test_cases = [ + "model_metrics_HL_1024_512_256_dae", + "model_metrics_HL_1024_512_vae", + "model_metrics_collab", + ] + expected = ["HL_1024_512_256", "HL_1024_512", "collab"] for test_case, v in zip(test_cases, expected): - assert select_content(test_case, first_split='metrics_') == v + assert select_content(test_case, first_split="metrics_") == v diff --git a/tests/test_models/test_transformers.py b/tests/test_models/test_transformers.py index 3147efb2a..c02a9272a 100644 --- a/tests/test_models/test_transformers.py +++ b/tests/test_models/test_transformers.py @@ -1,4 +1,5 @@ """Test scikit-learn transformers provided by PIMMS.""" + import numpy as np import pandas as pd import pytest @@ -7,11 +8,11 @@ from pimmslearn.sklearn.cf_transformer import CollaborativeFilteringTransformer -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def load_pkg_sample_data(): - test_data = 'project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M227.csv' - index_name = 'Sample ID' - column_name = 'protein group' + test_data = "project/data/dev_datasets/HeLa_6070/protein_groups_wide_N50_M227.csv" + index_name = "Sample ID" + column_name = "protein group" # read data, name index and columns df = pd.read_csv(test_data, index_col=0) df = np.log2(df + 1) @@ -21,13 +22,14 @@ def load_pkg_sample_data(): def test_CollaborativeFilteringTransformer(load_pkg_sample_data): - index_name = 'Sample ID' - column_name = 'protein group' - value_name = 'intensity' + index_name = "Sample ID" + column_name = "protein group" + value_name = "intensity" model = CollaborativeFilteringTransformer( target_column=value_name, sample_column=index_name, - item_column=column_name,) + item_column=column_name, + ) series = load_pkg_sample_data.stack() series.name = value_name # ! important # run for 2 epochs @@ -36,19 +38,22 @@ def test_CollaborativeFilteringTransformer(load_pkg_sample_data): assert df_imputed.isna().sum().sum() == 0 -@pytest.mark.parametrize("model", ['DAE', 'VAE']) +@pytest.mark.parametrize("model", ["DAE", "VAE"]) def test_AETransformer(model, load_pkg_sample_data): df = load_pkg_sample_data model = AETransformer( model=model, - hidden_layers=[512,], + hidden_layers=[ + 512, + ], latent_dim=50, - out_folder='runs/scikit_interface', + out_folder="runs/scikit_interface", batch_size=10, ) - model.fit(df, - cuda=False, - epochs_max=2, - ) + model.fit( + df, + cuda=False, + epochs_max=2, + ) df_imputed = model.transform(df) assert df_imputed.isna().sum().sum() == 0 diff --git a/tests/test_nb.py b/tests/test_nb.py index 5d4b28ad4..058bdc239 100644 --- a/tests/test_nb.py +++ b/tests/test_nb.py @@ -4,6 +4,6 @@ def test_Config(): cfg = Config() - cfg.test = 'test' + cfg.test = "test" with pytest.raises(AttributeError): - cfg.test = 'raise AttributeError' + cfg.test = "raise AttributeError" diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 7fa645cca..6792bcf66 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -19,8 +19,8 @@ def test_interpolate(): # floating point problem: numbers are not treated as decimals expected = { - (0, 'pep1'): (27.8 + 28.9) / 2, - (0, 'pep3'): 23.6, + (0, "pep1"): (27.8 + 28.9) / 2, + (0, "pep3"): 23.6, (1, "pep2"): (29.1 + 27.6) / 2, (1, "pep3"): 23.6, (3, "pep1"): (28.9 + 28.7) / 2, @@ -40,14 +40,13 @@ def test_interpolate(): def test_flatten_dict_of_dicts(): - expected = {('a', 'a1', 'a2'): 1, - ('a', 'a1', 'a3'): 2, - ('b', 'b1', 'b2'): 3, - ('b', 'b1', 'b3'): 4} - data = { - "a": {'a1': {'a2': 1, 'a3': 2}}, - "b": {'b1': {'b2': 3, 'b3': 4}} + expected = { + ("a", "a1", "a2"): 1, + ("a", "a1", "a3"): 2, + ("b", "b1", "b2"): 3, + ("b", "b1", "b3"): 4, } + data = {"a": {"a1": {"a2": 1, "a3": 2}}, "b": {"b1": {"b2": 3, "b3": 4}}} actual = pimmslearn.pandas.flatten_dict_of_dicts(data) assert expected == actual @@ -55,38 +54,48 @@ def test_flatten_dict_of_dicts(): def test_key_map(): # Build a schema of dicts - d = {'one': {'alpha': {'a': 0.5, 'b': 0.3}}, - 'two': {'beta': {'a': 0.7, 'b': 0.5}, - 'gamma': {'a': 0.8, 'b': 0.9}}, - 'three': {'alpha': {'a': 0.4, 'b': 0.4}, - 'beta': {'a': 0.6, 'b': 0.5}, - 'gamma': {'a': 0.7, 'b': 0.6}, - 'delta': {'a': 0.2, 'b': 0.8}} - } - expected = {'one': {'alpha': ('a', 'b')}, - 'two': {'beta': ('a', 'b'), - 'gamma': ('a', 'b')}, - 'three': {'alpha': ('a', 'b'), - 'beta': ('a', 'b'), - 'gamma': ('a', 'b'), - 'delta': ('a', 'b')}} + d = { + "one": {"alpha": {"a": 0.5, "b": 0.3}}, + "two": {"beta": {"a": 0.7, "b": 0.5}, "gamma": {"a": 0.8, "b": 0.9}}, + "three": { + "alpha": {"a": 0.4, "b": 0.4}, + "beta": {"a": 0.6, "b": 0.5}, + "gamma": {"a": 0.7, "b": 0.6}, + "delta": {"a": 0.2, "b": 0.8}, + }, + } + expected = { + "one": {"alpha": ("a", "b")}, + "two": {"beta": ("a", "b"), "gamma": ("a", "b")}, + "three": { + "alpha": ("a", "b"), + "beta": ("a", "b"), + "gamma": ("a", "b"), + "delta": ("a", "b"), + }, + } actual = pimmslearn.pandas.key_map(d) assert expected == actual - d = {'one': {'alpha': {'a': 0.5, 'b': 0.3}}, - 'two': {'beta': {'a': 0.7, 'b': 0.5}, - 'gamma': {'a': 0.8, 'b': 0.9}}, - 'three': {'alpha': {'a': 0.4, 'b': 0.4}, - 'beta': {'a': 0.6, 'b': 0.5}, - 'gamma': {'a': 0.7, 'b': 0.6}, - 'delta': 3} - } - expected = {'one': {'alpha': ('a', 'b')}, - 'two': {'beta': ('a', 'b'), - 'gamma': ('a', 'b')}, - 'three': {'alpha': ('a', 'b'), - 'beta': ('a', 'b'), - 'gamma': ('a', 'b'), - 'delta': None}} + d = { + "one": {"alpha": {"a": 0.5, "b": 0.3}}, + "two": {"beta": {"a": 0.7, "b": 0.5}, "gamma": {"a": 0.8, "b": 0.9}}, + "three": { + "alpha": {"a": 0.4, "b": 0.4}, + "beta": {"a": 0.6, "b": 0.5}, + "gamma": {"a": 0.7, "b": 0.6}, + "delta": 3, + }, + } + expected = { + "one": {"alpha": ("a", "b")}, + "two": {"beta": ("a", "b"), "gamma": ("a", "b")}, + "three": { + "alpha": ("a", "b"), + "beta": ("a", "b"), + "gamma": ("a", "b"), + "delta": None, + }, + } actual = pimmslearn.pandas.key_map(d) assert expected == actual diff --git a/tests/test_sampling.py b/tests/test_sampling.py index db7d887f2..b5a8c8c4d 100644 --- a/tests/test_sampling.py +++ b/tests/test_sampling.py @@ -12,7 +12,11 @@ @pytest.fixture def random_data(): """Fixture to load random data.""" - return create_random_df(100, 10, prop_na=0.1).rename_axis('Sample ID').rename_axis('feat name', axis=1) + return ( + create_random_df(100, 10, prop_na=0.1) + .rename_axis("Sample ID") + .rename_axis("feat name", axis=1) + ) @pytest.fixture @@ -20,24 +24,28 @@ def example_data(): """ Fixture to load example data from a csv file for testing. """ - example_data_path = Path(__file__).resolve().parent / 'test_data.csv' - return pd.read_csv(example_data_path, index_col='id').rename_axis('Sample ID').rename_axis('feat name', axis=1) + example_data_path = Path(__file__).resolve().parent / "test_data.csv" + return ( + pd.read_csv(example_data_path, index_col="id") + .rename_axis("Sample ID") + .rename_axis("feat name", axis=1) + ) def test_feature_frequency(random_data): X = random_data - assert all(feature_frequency(X) - == - frequency_by_index(to_long_format(X), - sample_index_to_drop='Sample ID')) + assert all( + feature_frequency(X) + == frequency_by_index(to_long_format(X), sample_index_to_drop="Sample ID") + ) def test_frequency_by_index(example_data): X = example_data - assert all(feature_frequency(X) - == - frequency_by_index(to_long_format(X), - sample_index_to_drop=0)) + assert all( + feature_frequency(X) + == frequency_by_index(to_long_format(X), sample_index_to_drop=0) + ) def test_sample_data(random_data): @@ -47,12 +55,11 @@ def test_sample_data(random_data): freq.loc[excluded_feat] = 0 X = to_long_format(X).squeeze() # ValueError: Fewer non-zero entries in p than size -> too many feat set to zero - series_sampled, series_not_sampled = sample_data( - X, 0, frac=0.70, weights=freq) - assert len(X) == len( - series_sampled) + len(series_not_sampled) + series_sampled, series_not_sampled = sample_data(X, 0, frac=0.70, weights=freq) + assert len(X) == len(series_sampled) + len(series_not_sampled) assert X.index.difference( - series_sampled.index.append(series_not_sampled.index)).empty + series_sampled.index.append(series_not_sampled.index) + ).empty idx_excluded = series_sampled.index.isin(excluded_feat, level=1) assert series_sampled.loc[idx_excluded].empty idx_excluded = series_not_sampled.index.isin(excluded_feat, level=1) diff --git a/tests/test_transfrom.py b/tests/test_transfrom.py index 5da59f2d8..9caf20d2f 100644 --- a/tests/test_transfrom.py +++ b/tests/test_transfrom.py @@ -9,166 +9,195 @@ from numpy import nan - data = { - 'feat_00': {'sample_023': 18.475502941566493, - 'sample_156': 22.535341434252544, - 'sample_088': 7.413097020304393, - 'sample_071': 22.127469047666743, - 'sample_040': 5.068051892164936, - 'sample_160': 10.21812499743875, - 'sample_046': 9.71037709212731, - 'sample_137': 20.07067011987325, - 'sample_180': 15.638881429624169, - 'sample_029': 21.0290739437736}, - 'feat_01': {'sample_023': 19.052809526029314, - 'sample_156': nan, - 'sample_088': 4.78634040339565, - 'sample_071': 7.1633143730490705, - 'sample_040': 8.35771017095876, - 'sample_160': nan, - 'sample_046': 12.763093158492504, - 'sample_137': 13.868866830018217, - 'sample_180': 29.280414698376408, - 'sample_029': 2.182890190925806}, - 'feat_02': {'sample_023': 1.3591202931613355, - 'sample_156': nan, - 'sample_088': 26.15350699776605, - 'sample_071': 3.3142233939418397, - 'sample_040': 5.3103145283024045, - 'sample_160': 12.28886684685696, - 'sample_046': 15.22831136053365, - 'sample_137': 12.352966247535255, - 'sample_180': 3.766506442879601, - 'sample_029': 24.655801778710686}, - 'feat_03': {'sample_023': 11.238378438794136, - 'sample_156': 10.643138755079526, - 'sample_088': 6.57641962074133, - 'sample_071': 10.638664729223295, - 'sample_040': 2.6610760127116686, - 'sample_160': 9.336524851901807, - 'sample_046': 7.2722919724524075, - 'sample_137': 19.529204014528624, - 'sample_180': nan, - 'sample_029': 21.187266814694887}, - 'feat_04': {'sample_023': 18.77579747142709, - 'sample_156': 28.25177134856775, - 'sample_088': 29.275957674573938, - 'sample_071': 8.617169749622452, - 'sample_040': 3.6190761330180243, - 'sample_160': 20.313619192050282, - 'sample_046': 3.445104742176106, - 'sample_137': 16.362956082207162, - 'sample_180': 23.10474635271288, - 'sample_029': 2.440463419256993}, - 'feat_05': {'sample_023': 15.094087757402631, - 'sample_156': 20.05717940168742, - 'sample_088': 10.106873753133206, - 'sample_071': 8.889243613679703, - 'sample_040': 13.823363040981773, - 'sample_160': 18.173377483580712, - 'sample_046': 18.318601273248976, - 'sample_137': 1.8681931764565685, - 'sample_180': 24.214666120946475, - 'sample_029': 2.5451314225557575}, - 'feat_06': {'sample_023': 25.694695235649668, - 'sample_156': 20.360098700176945, - 'sample_088': 5.463537470660978, - 'sample_071': 7.008232531497298, - 'sample_040': 6.190011552173775, - 'sample_160': 10.9378118459262, - 'sample_046': 8.658916597207673, - 'sample_137': 15.375079463109952, - 'sample_180': 3.6062168736172873, - 'sample_029': 29.599187355035262}, - 'feat_07': {'sample_023': 19.76080894856835, - 'sample_156': 10.857599725135948, - 'sample_088': 23.690955214274375, - 'sample_071': 1.2627956890908565, - 'sample_040': 10.928095831442263, - 'sample_160': 6.536776627649703, - 'sample_046': 17.437146642678368, - 'sample_137': 24.192108516985186, - 'sample_180': 7.967484988758182, - 'sample_029': 11.228123872683609}, - 'feat_08': {'sample_023': 4.888032812442891, - 'sample_156': 17.809823736256153, - 'sample_088': 19.761233265026284, - 'sample_071': 0.5362180420024143, - 'sample_040': 15.102518125645707, - 'sample_160': 29.641078984667757, - 'sample_046': 4.630881458226069, - 'sample_137': 13.777196327908742, - 'sample_180': nan, - 'sample_029': 11.119264412006727}, - 'feat_09': {'sample_023': 2.1170624220128955, - 'sample_156': nan, - 'sample_088': 14.945871493594169, - 'sample_071': 29.631671692080946, - 'sample_040': 20.71184485888096, - 'sample_160': 13.620048635883446, - 'sample_046': 14.434203055644524, - 'sample_137': 1.5586973266280524, - 'sample_180': 8.799258323416941, - 'sample_029': 24.383987017725076}, - 'feat_10': {'sample_023': 19.27257834618947, - 'sample_156': 19.082881392286268, - 'sample_088': 16.660906528128937, - 'sample_071': 12.83319401207587, - 'sample_040': 1.179364195232968, - 'sample_160': 20.64822707340711, - 'sample_046': 15.977682976547577, - 'sample_137': 23.588345981520995, - 'sample_180': 23.194256217560145, - 'sample_029': 28.417457321515762}, - 'feat_11': {'sample_023': 0.7953393162486544, - 'sample_156': 27.3986084272871, - 'sample_088': 21.576053348167914, - 'sample_071': 11.52979941479045, - 'sample_040': 23.98231196727128, - 'sample_160': 4.216589027350823, - 'sample_046': 1.5547061046728072, - 'sample_137': 6.040913464079232, - 'sample_180': 15.538815831377368, - 'sample_029': 29.580031914686128}, - 'feat_12': {'sample_023': 17.5732674382039, - 'sample_156': 18.377204050344574, - 'sample_088': 6.853642239938962, - 'sample_071': 20.389418480792095, - 'sample_040': 18.837011684727234, - 'sample_160': 14.567690989781934, - 'sample_046': 10.098128345817617, - 'sample_137': 7.758625046641163, - 'sample_180': 10.442867795048642, - 'sample_029': 22.601345557768248}, - 'feat_13': {'sample_023': 28.206907242748727, - 'sample_156': 26.21095780399035, - 'sample_088': 29.89001748170226, - 'sample_071': 6.5476166359519254, - 'sample_040': 2.4527709584661572, - 'sample_160': nan, - 'sample_046': 4.032440308169227, - 'sample_137': 4.941190602968877, - 'sample_180': 11.152178847189909, - 'sample_029': 11.287787565927474}, - 'feat_14': {'sample_023': 17.26422533627637, - 'sample_156': 21.719191663982816, - 'sample_088': 29.243794864403494, - 'sample_071': 28.49883551850676, - 'sample_040': 26.207358723203317, - 'sample_160': 15.163653537313499, - 'sample_046': 1.9012491141830323, - 'sample_137': 9.906451946144927, - 'sample_180': nan, - 'sample_029': 2.505021500960063} + "feat_00": { + "sample_023": 18.475502941566493, + "sample_156": 22.535341434252544, + "sample_088": 7.413097020304393, + "sample_071": 22.127469047666743, + "sample_040": 5.068051892164936, + "sample_160": 10.21812499743875, + "sample_046": 9.71037709212731, + "sample_137": 20.07067011987325, + "sample_180": 15.638881429624169, + "sample_029": 21.0290739437736, + }, + "feat_01": { + "sample_023": 19.052809526029314, + "sample_156": nan, + "sample_088": 4.78634040339565, + "sample_071": 7.1633143730490705, + "sample_040": 8.35771017095876, + "sample_160": nan, + "sample_046": 12.763093158492504, + "sample_137": 13.868866830018217, + "sample_180": 29.280414698376408, + "sample_029": 2.182890190925806, + }, + "feat_02": { + "sample_023": 1.3591202931613355, + "sample_156": nan, + "sample_088": 26.15350699776605, + "sample_071": 3.3142233939418397, + "sample_040": 5.3103145283024045, + "sample_160": 12.28886684685696, + "sample_046": 15.22831136053365, + "sample_137": 12.352966247535255, + "sample_180": 3.766506442879601, + "sample_029": 24.655801778710686, + }, + "feat_03": { + "sample_023": 11.238378438794136, + "sample_156": 10.643138755079526, + "sample_088": 6.57641962074133, + "sample_071": 10.638664729223295, + "sample_040": 2.6610760127116686, + "sample_160": 9.336524851901807, + "sample_046": 7.2722919724524075, + "sample_137": 19.529204014528624, + "sample_180": nan, + "sample_029": 21.187266814694887, + }, + "feat_04": { + "sample_023": 18.77579747142709, + "sample_156": 28.25177134856775, + "sample_088": 29.275957674573938, + "sample_071": 8.617169749622452, + "sample_040": 3.6190761330180243, + "sample_160": 20.313619192050282, + "sample_046": 3.445104742176106, + "sample_137": 16.362956082207162, + "sample_180": 23.10474635271288, + "sample_029": 2.440463419256993, + }, + "feat_05": { + "sample_023": 15.094087757402631, + "sample_156": 20.05717940168742, + "sample_088": 10.106873753133206, + "sample_071": 8.889243613679703, + "sample_040": 13.823363040981773, + "sample_160": 18.173377483580712, + "sample_046": 18.318601273248976, + "sample_137": 1.8681931764565685, + "sample_180": 24.214666120946475, + "sample_029": 2.5451314225557575, + }, + "feat_06": { + "sample_023": 25.694695235649668, + "sample_156": 20.360098700176945, + "sample_088": 5.463537470660978, + "sample_071": 7.008232531497298, + "sample_040": 6.190011552173775, + "sample_160": 10.9378118459262, + "sample_046": 8.658916597207673, + "sample_137": 15.375079463109952, + "sample_180": 3.6062168736172873, + "sample_029": 29.599187355035262, + }, + "feat_07": { + "sample_023": 19.76080894856835, + "sample_156": 10.857599725135948, + "sample_088": 23.690955214274375, + "sample_071": 1.2627956890908565, + "sample_040": 10.928095831442263, + "sample_160": 6.536776627649703, + "sample_046": 17.437146642678368, + "sample_137": 24.192108516985186, + "sample_180": 7.967484988758182, + "sample_029": 11.228123872683609, + }, + "feat_08": { + "sample_023": 4.888032812442891, + "sample_156": 17.809823736256153, + "sample_088": 19.761233265026284, + "sample_071": 0.5362180420024143, + "sample_040": 15.102518125645707, + "sample_160": 29.641078984667757, + "sample_046": 4.630881458226069, + "sample_137": 13.777196327908742, + "sample_180": nan, + "sample_029": 11.119264412006727, + }, + "feat_09": { + "sample_023": 2.1170624220128955, + "sample_156": nan, + "sample_088": 14.945871493594169, + "sample_071": 29.631671692080946, + "sample_040": 20.71184485888096, + "sample_160": 13.620048635883446, + "sample_046": 14.434203055644524, + "sample_137": 1.5586973266280524, + "sample_180": 8.799258323416941, + "sample_029": 24.383987017725076, + }, + "feat_10": { + "sample_023": 19.27257834618947, + "sample_156": 19.082881392286268, + "sample_088": 16.660906528128937, + "sample_071": 12.83319401207587, + "sample_040": 1.179364195232968, + "sample_160": 20.64822707340711, + "sample_046": 15.977682976547577, + "sample_137": 23.588345981520995, + "sample_180": 23.194256217560145, + "sample_029": 28.417457321515762, + }, + "feat_11": { + "sample_023": 0.7953393162486544, + "sample_156": 27.3986084272871, + "sample_088": 21.576053348167914, + "sample_071": 11.52979941479045, + "sample_040": 23.98231196727128, + "sample_160": 4.216589027350823, + "sample_046": 1.5547061046728072, + "sample_137": 6.040913464079232, + "sample_180": 15.538815831377368, + "sample_029": 29.580031914686128, + }, + "feat_12": { + "sample_023": 17.5732674382039, + "sample_156": 18.377204050344574, + "sample_088": 6.853642239938962, + "sample_071": 20.389418480792095, + "sample_040": 18.837011684727234, + "sample_160": 14.567690989781934, + "sample_046": 10.098128345817617, + "sample_137": 7.758625046641163, + "sample_180": 10.442867795048642, + "sample_029": 22.601345557768248, + }, + "feat_13": { + "sample_023": 28.206907242748727, + "sample_156": 26.21095780399035, + "sample_088": 29.89001748170226, + "sample_071": 6.5476166359519254, + "sample_040": 2.4527709584661572, + "sample_160": nan, + "sample_046": 4.032440308169227, + "sample_137": 4.941190602968877, + "sample_180": 11.152178847189909, + "sample_029": 11.287787565927474, + }, + "feat_14": { + "sample_023": 17.26422533627637, + "sample_156": 21.719191663982816, + "sample_088": 29.243794864403494, + "sample_071": 28.49883551850676, + "sample_040": 26.207358723203317, + "sample_160": 15.163653537313499, + "sample_046": 1.9012491141830323, + "sample_137": 9.906451946144927, + "sample_180": nan, + "sample_029": 2.505021500960063, + }, } def test_Vaep_Pipeline(): dae_default_pipeline = sklearn.pipeline.Pipeline( [ - ('normalize', preprocessing.StandardScaler()), - ('impute', impute.SimpleImputer(add_indicator=False)) # True won't work + ("normalize", preprocessing.StandardScaler()), + ("impute", impute.SimpleImputer(add_indicator=False)), # True won't work ] ) df = pd.DataFrame(data) @@ -182,14 +211,14 @@ def test_Vaep_Pipeline(): with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.iloc[0]) # pd.DataFrame with pytest.raises(ValueError): - _ = dae_transforms.inverse_transform(res.loc['sample_156']) # pd.DataFrame + _ = dae_transforms.inverse_transform(res.loc["sample_156"]) # pd.DataFrame with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(to_tensor(res)) # torch.Tensor with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.values) # numpy.array with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.values[0]) # single sample - dae_transforms = VaepPipeline(df, encode=dae_default_pipeline, decode=['normalize']) + dae_transforms = VaepPipeline(df, encode=dae_default_pipeline, decode=["normalize"]) res = dae_transforms.transform(df) res = dae_transforms.inverse_transform(res) npt.assert_array_almost_equal(df.values[mask], res.values[mask]) diff --git a/tests/test_utils.py b/tests/test_utils.py index 290cb66e7..260af5002 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,11 +3,13 @@ def test_append_to_filepath(): - fp = pathlib.Path('data/experiment_data.csv') + fp = pathlib.Path("data/experiment_data.csv") - fp_new = pathlib.Path('data/experiment_data_processed.csv') - assert append_to_filepath(filepath=fp, to_append='processed') == fp_new + fp_new = pathlib.Path("data/experiment_data_processed.csv") + assert append_to_filepath(filepath=fp, to_append="processed") == fp_new - fp_new = pathlib.Path('data/experiment_data_processed.pkl') - assert append_to_filepath( - filepath=fp, to_append='processed', new_suffix='pkl') == fp_new + fp_new = pathlib.Path("data/experiment_data_processed.pkl") + assert ( + append_to_filepath(filepath=fp, to_append="processed", new_suffix="pkl") + == fp_new + ) From 428d39d295662720ff1a3d457f352da3850a2624 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 13:46:06 +0100 Subject: [PATCH 09/12] :wrench: try to install tcltk manually for windows python 3.10 --- .github/workflows/ci.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b0af57f9b..c66e13462 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -115,6 +115,11 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Install prerequisites (Windows) + if: runner.os == 'Windows' + run: | + choco install tcltk + - name: install pimms run: pip install . From f9c8b435d6cbafa6a2e971214c1f36478216b1cf Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 14:20:00 +0100 Subject: [PATCH 10/12] :bug: just ignore 3.10 (will be out of life soon) --- .github/workflows/ci.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c66e13462..65e5e679c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -104,7 +104,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 with: @@ -115,11 +115,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install prerequisites (Windows) - if: runner.os == 'Windows' - run: | - choco install tcltk - - name: install pimms run: pip install . From 9ef5f7b45c35855efba003aae018193a43453f5d Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 14:44:48 +0100 Subject: [PATCH 11/12] :bug: isort only on package folder --- .github/workflows/ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 65e5e679c..6680546e3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,6 +20,8 @@ jobs: with: src: pimmslearn - uses: isort/isort-action@v1 + with: + sortPaths: pimmslearn lint: name: Lint with ruff runs-on: ubuntu-latest From 986aadc9fb50ed41270033b2e49ffd897f4fc9d7 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 16 Feb 2026 14:45:08 +0100 Subject: [PATCH 12/12] :bug: exclude windows server image w/o tk (tcl) for now --- .github/workflows/ci.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6680546e3..05ea24948 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -105,7 +105,11 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + os: [ + "ubuntu-latest", + "macos-latest", + #"windows-latest" # tk/tcl issue in current runner images + ] python-version: ["3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4