CellNeighborEX-Wrapper/Arians_Stuff/WrapperTest/ccc_neighbor.py at main · Winterwind/CellNeighborEX-Wrapper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
__author__ = 'Arian Djahed'
__date__ = 'March 28, 2025'
__copyright__ = 'Copyright 2025, Bonsai Applied Computations Group'
__version__ = '1.0'

import pandas as pd
import anndata as ad
import numpy as np
import matplotlib.pyplot as plt
import os, warnings, importlib, pdf2image
import CellNeighborEX
import DEanalysis as de_analysis
import visualization as cnex_vis


# depracated function (contents moved to cell_cell_degs since that's the only function that uses this code anyway)
'''
def displayplots(celltype:str) -> None:
    # get the path for the plots
    rootpath = os.getcwd()
    path_deg = os.path.join(rootpath, 'DE_results/')

    heatmap_path = os.path.join(path_deg, f'{celltype}/{celltype}_heatmap.pdf')
    volcano_path = os.path.join(path_deg, f'{celltype}/{celltype}_volcano.pdf')

    heatmap = pdf2image.convert_from_path(heatmap_path)[0]
    volcano = pdf2image.convert_from_path(volcano_path)[0]

    if volcano and heatmap:
        #display heatmap
        plt.figure(figsize=(12, 6))
        plt.imshow(np.array(heatmap))
        plt.axis('off')
        plt.grid(False)
        plt.show(block=False)

        #display volcano plot
        plt.figure(figsize=(12, 6))
        plt.imshow(np.array(volcano))
        plt.axis('off')
        plt.grid(False)
        plt.show(block=False)
    else:
        print(f'No plots found for {celltype}.')
'''

def prepare_for_ccc(h5ad_path:str) -> pd.DataFrame:
    """
    Create the anndata object & dataframe with all neighbor-dependent celltypes for use in the subsequent differential expression analysis.

    Parameters
    ----------
        **h5ad_path**: *str*
            The FULL path to the h5ad file containing the MERFISH data to use

    Returns
    -------
        *tuple*
            A tuple containing adata and df_processed.
            - **adata**: The AnnData object made out of the Merfish data.
            - **df_processed**: The processed pandas datadrame containing the neighbors data from Delaunay triangulation.

    Notes
    -----
        This function MUST be executed before any other function in this module, as they all depend on the output of this function.
    """

    pd.options.mode.chained_assignment = None  # supress those pesky "SettingWithCopyWarning" warnings

    print(f'[1/4] Reading in AnnData object...')

    #h5ad_path = str(input('Enter the full path to the h5ad file: '))
    adata = ad.read_h5ad(h5ad_path)

    print(f'[2/4] Creating Preprocessed Data Frame...')

    # Create a dataframe from the provided AnnData object.
    # coord_key (str): Key to access the spatial coordinates in `adata.obsm`.
    # celltype_key (str): Key to access the cell type information in `adata.obs`.
    df =  CellNeighborEX.neighbors.create_dataframe(adata, coord_key='spatial', celltype_key='cell_type')

    print(f'[3/4] Constructing Neighbor Matrix...')

    # Find immediate neighbors using Delaunay triangulation and retrieve the spatial connectivity matrix.
    matrix = CellNeighborEX.neighbors.detect_neighbors(adata, coord_key='spatial', type='generic', knn=None, radius_value=None, delaunay=True)
    # Calculate the number of neighbors for each cell.
    neiNum = CellNeighborEX.neighbors.get_neighbors(matrix)

    print(f'[4/4] Processing Data Frame (this may take a bit)...')

    # Processes the dataframe by adding additional columns based on the neighbor matrix and neighbor counts.
    # Keeping save as True for later use with the spatial plot
    df_processed = CellNeighborEX.neighbors.process_dataframe(df, matrix, neiNum, save=True)
    return adata, df_processed


def neighbor_dependent_expression(adata:ad.AnnData, df_processed:pd.DataFrame, lrCutoff:float=0.2, pCutoff:float=0.05, pCutoff2:float=0.75, direction:str='up', normality_test:bool=False, top_genes:int=10, showplots:bool=False) -> pd.DataFrame:
    '''
    Perform the neighbor-dependent expression analysis to connect cell-cell interaction combinations to differentially expressed genes.

    Parameters
    ----------
        **adata** : *ad.AnnData*
            The anndata object created from the original h5ad input file by `prepare_for_ccc`.
        **df_processed** : *pd.DataFrame*
            The processed pandas dataframe containing the neighbors data from Delaunay triangulation (also from `prepare_for_ccc`).
        **lrCutoff** : *float*
            The threshold value for the log ratio (is `0.2` by default).
        **pCutoff** : *float*
            The threshold value for the p-value (is `0.05` by default).
        **pCutoff2** : *float*
            The threshold value for the false discovery rate (is `0.75` by default).
        **direction** : *str*
            The indicator to determine which direction is considered "positive"; `'up'` means that up-regulated genes are positive and `'down'` means that down-regulated genes are positive (is `'up'` by default).
        **normality_test** : *bool*
            `True`: depending on the result of the normality test, the statistical test is determined. If the data is normal, the parametric test is used. Otherwise, the non-parametric test is used.\n
            `False`: when sample size (number of cells/spots) is larger than 30, the parameteric test is used. Otherwise, the non-parametric test is used. (is `False` by default).
        **top_genes** : *int*
            The number of top differentially expressed genes to annotate in the volcano plot (is `10` by default).
        **showplots** : *bool*
            The indicator to determine whether or not to show the heatmaps and volcano plots for each celltype in the list (is `False` by default).

    Returns
    -------
        **deg_list** : *pd.DataFrame*
            The dataframe containing all of the differentially-expressed genes along with their celltypes and their respective p-value cutoffs, false discover rate cutoffs, and log ratios.

    Notes
    -----
        This function MUST be executed before executing `cell_cell_degs` and `is_gene_degs`, as they both depend on the output of this function.
    '''

    # Suppress warnings that may show up in this function
    pd.options.mode.chained_assignment = None
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    print(f'[1/3] Generating input files...')

    # All categorzied files (index_, matchComb_, neiCombUnique_, prop_ .csv) are saved in the "categorized_data folder" in the root directory.
    # Note: there is no "save" parameter for this function, so it saves the files whether you like it or not
    CellNeighborEX.categorization.generate_input_files(data_type = "Image", df = df_processed, sample_size=30, min_sample_size=1)

    print(f'[2/3] Setting parameters to analyze the data...')

    rootpath = os.getcwd()
    path_categorization = os.path.join(rootpath, 'categorized_data/')
    barcodes = df_processed['barcode'].tolist()
    adata = adata[barcodes, :]

    # Save the data into dataframes.
    index_name = adata.obs.index.name
    df_cell_id = pd.DataFrame(adata.obs.index)
    df_cell_id = df_cell_id.rename(columns={index_name: 0})

    var_name = adata.var.index.name
    df_gene_name = pd.DataFrame(adata.var.index)
    df_gene_name = df_gene_name.rename(columns={var_name: 0})

    log_adata = adata.copy()
    log_adata.X = adata.layers["log_counts"]

    df_log_data = log_adata.to_df().T
    df_log_data = df_log_data.reset_index(drop=True) # row indices are represented as numbers.

    assert len(df_cell_id) == len(df_processed), "The number of cells in the dataframes do not match."

    print(f'[3/3] Peforming neighbor-dependent expression...')

    importlib.reload(de_analysis)  # Reload the module to reflect changes
    deg_list = de_analysis.analyze_data(df_cell_id, df_gene_name, df_log_data, path_categorization, lrCutoff, pCutoff, pCutoff2, direction, normality_test, top_genes, save=True, showplots=showplots)
    return deg_list


#depracated function (functionality now handled by neighbor_dependent_expression)
'''
def most_deg_neighbor_dependent(deg_list:pd.DataFrame, pval:float, fdr:float, lrCutoff:float, showplots:bool=True) -> pd.DataFrame:

    # get the genes and celltypes that match the inputted p-value and log ratio
    # Ensure columns are numeric
    deg_list['pvalue1'] = pd.to_numeric(deg_list['pvalue1'], errors='coerce')
    deg_list['fdr1'] = pd.to_numeric(deg_list['fdr1'], errors='coerce')
    deg_list['logRatio1'] = pd.to_numeric(deg_list['logRatio1'], errors='coerce')

    # Apply conditions with proper parentheses
    mask = (deg_list['pvalue1'] < pval) & (deg_list['fdr1'] < fdr) & (deg_list['logRatio1'] > lrCutoff)
    most_deg_list = deg_list[mask].copy()

    #volcano plots and heatmaps for interaction
    if showplots:
        for celltype in most_deg_list['celltype']:
            displayplots(celltype)

    return most_deg_list
'''


def get_full_gene_list(save_df:bool=True) -> pd.DataFrame:
    '''
    Generate the entire list of genes for every detected cell-cell interaction in the dataset, regardless of if it's differentially expressed or not.

    Parameters
    ----------
        **save_df** : *bool*
            The indicator to determine whether or not to save the list of all genes as a .csv file within the DE_results folder (is `True` by default).

    Returns
    -------
        **full_gene_list** : *pd.DataFrame*
            The dataframe containing all of the genes from every cell-cell interaction, along with the celltypes associated with them and their respective p-values, log ratios, and false discovery rates.
    '''

    # get the path for de_results
    de_results_path = os.path.join(os.getcwd(), f'DE_results/')

    # get the list of directory items
    dir_list = os.listdir(de_results_path)

    # initialize an empty dataframe to tack everything onto and return later
    full_gene_list = pd.DataFrame()

    # loop through them to get the list of genes from each subdirectory (making sure to skip anything that isn't a subdirectory)
    for item in dir_list:
        # get the subdirectory path
        current_dir = os.path.join(de_results_path, item)

        if os.path.isdir(current_dir):
            # get the current file
            current_file = pd.read_csv(f'{current_dir}/{item}_stat_total_genes.csv')

            # add the celltype column and make it the first column so the format is the same as deg_list
            current_file['celltype'] = item
            current_file = current_file[['celltype'] + [col for col in current_file.columns if col != 'celltype']]

            # tack it onto our dataframe
            full_gene_list = pd.concat([full_gene_list, current_file])

    if save_df:
        full_gene_list.to_csv(f'{de_results_path}/full_gene_list.csv', index=None)

    return full_gene_list


def cell_cell_degs(deg_list:pd.DataFrame, cell1:str, cell2:str, showplots:bool=True) -> pd.DataFrame:
    '''
    Get all of the differentially expressed genes for the given combination of cells.

    Parameters
    ----------
        **deg_list** : *pd.DataFrame*
            The dataframe containing all of the differentially expressed genes.
        **cell1** : *str*
            The first cell in the combination.
        **cell2** : *str*
            The second cell in the combination.
        **showplots** : *bool*
            The indicator to determine whether or not to show the volcano plot and heatmap for the celltype (is `True` by default).

    Returns
    -------
        **cell_cell_deg** : *pd.DataFrame*
            The dataframe containing all of the differentially expressed genes for the given combination
    '''

    # get the celltype column to use in the search
    celltype = f'{cell1}+{cell2}'

    # get the genes with that celltype
    cell_cell_deg = deg_list[deg_list['celltype'] == celltype]

    #volcano plots and heatmaps for interaction
    if showplots:
        # get the path for the plots
        path_deg = os.path.join(os.getcwd(), 'DE_results/')

        heatmap_path = os.path.join(path_deg, f'{celltype}/{celltype}_heatmap.pdf')
        volcano_path = os.path.join(path_deg, f'{celltype}/{celltype}_volcano.pdf')

        heatmap = pdf2image.convert_from_path(heatmap_path)[0]
        volcano = pdf2image.convert_from_path(volcano_path)[0]

        if volcano and heatmap:
            #display heatmap
            plt.figure(figsize=(12, 6))
            plt.imshow(np.array(heatmap))
            plt.axis('off')
            plt.grid(False)
            plt.show(block=False)

            #display volcano plot
            plt.figure(figsize=(12, 6))
            plt.imshow(np.array(volcano))
            plt.axis('off')
            plt.grid(False)
            plt.show(block=False)
        else:
            print(f'No plots found for {celltype}.')

    return cell_cell_deg


def is_gene_degs(deg_list:pd.DataFrame, gene:str, showplots:bool=True) -> pd.DataFrame:
    '''
    Get all of the cell-cell interaction combinations associated with a specific gene (inputted by the user).

    Parameters
    ----------
        **deg_list** : *pd.DataFrame*
            The dataframe containing all of the differentially expressed genes.
        **gene** : *str*
            The name of the chosen gene to search for in the dataframe.
        **showplots** : *bool*
            The indicator to determine whether or not to show the spatial plot for the gene (is `True` by default).

    Returns
    -------
        **cells4gene** : *pd.DataFrame*
            The dataframe containing all of the cell-cell interaction combinations associated with the specific gene.

    Notes
    -----
        The time it takes to make one spatial plot can be a bit lengthy, so if `cells4gene` ends up being a long list, be ready to potentially wait a while.
    '''

    # get the celltypes column for that gene
    cells4gene = deg_list[deg_list['DEG'] == gene]

    # spatial plot
    if showplots:
        # get the path for the plots and for the dataframe
        rootpath = os.getcwd()
        dataframe_path = os.path.join(rootpath, 'neighbor_info/df_processed.csv')
        dfp = pd.read_csv(dataframe_path)

        importlib.reload(cnex_vis)  # Reload the module to reflect changes
        for celltype in cells4gene['celltype']:
            print(f'Generating spatial plot for {celltype}...')
            cta, ctb = celltype.split('+')

            # load the data for each celltype
            path_selected = os.path.join(rootpath, f'DE_results/{cta}+{ctb}/')
            print(f'current path: {path_selected}')
            column_names = ['barcode', 'logdata', 'zscore']
            heterotypic = pd.read_csv(path_selected + f'{cta}+{ctb}_{gene}.txt', delimiter=",", names = column_names)
            homotypic1 = pd.read_csv(path_selected + f'{cta}+{cta}_{gene}.txt', delimiter=",", names = column_names)
            homotypic2 = pd.read_csv(path_selected + f'{ctb}+{ctb}_{gene}.txt', delimiter=",", names = column_names)
            heterotypic['type'] = f'{cta}+{ctb}'
            homotypic1['type'] = f'{cta}+{cta}'
            homotypic2['type'] = f'{ctb}+{ctb}'

            df_exp = pd.concat([heterotypic, homotypic1, homotypic2])

            # set the parameter values
            df_bg, df_red, df_blue, df_black = cnex_vis.set_parameters(dfp, df_exp, beadsize_bg=5, edgecolor_bg=(0.85,0.85,0.85), beadcolor_bg=(0.85,0.85,0.85), beadsize_red=60, beadsize_blue=30, beadsize_black=30, type_red=f'{cta}+{ctb}', type_blue=f'{cta}+{cta}', type_black=f'{ctb}+{ctb}')

            # Get the spatial map.
            # zorder_red, zorder_blue, and zorder_black are parameters that determine the drawing order in the spatial map.
            # If save=True, the spatial map is saved in the "spatialMap" folder in the root directory.
            cnex_vis.get_spatialPlot(df_bg, df_red, df_blue, df_black, label_red=f'{cta}+{ctb}', label_blue=f'{cta}+{cta}', label_black=f'{ctb}+{ctb}', label_gene=gene, zorder_red=2.0, zorder_blue=3.0, zorder_black=4.0, figsize=(20,28), save=True)

    return cells4gene


if __name__ == '__main__':
    print(os.getcwd())