Module `cellex.preprocessing.anova`

Expand source code

import numpy as np
import pandas as pd
import time
import datetime
from scipy import stats

def anova(df: pd.DataFrame, annotation: np.ndarray, threshold: np.float=0.00001, verbose: bool=False):
    """Apply ANOVA and filter data
    Applies ANOVA to identify lowly / sporadically expressed genes and remove them.
    Values of each gene (row) is split into groups according to the annotation.
    ANOVA is then computed for these groups to get a p-value for this gene.
    The p-value is used to filter out genes with insignificant differences in variance.
    
    Parameters
    ----------
    df : DataFrame
        Log normalized expression data.
    
    annotation : ndarray
        Annotation of cells, e.g. cell-type or class.
    
    threshold : float, optional (default: 0.00001)
        P-value cutoff.

    verbose : bool, optional (default: False)
        Print progress report.
    
    Returns
    -------
    dict
        "anova" : df_anova    : DataFrame of F-values and p-values.
        "df"    : df_filtered : DataFrame of genes with pval < threshold.

    NOTE:
    * df format – rows: genes, cols: cells.
              A     B     C
        POMC  0.0   0.5   0.9
        AGRP  0.2   0.0   0.0
        LEPR  0.1   0.1   0.4

    * annotation format – 1 by n, i.e. one row.
        ["ABC", "ABC", "ABC", "ACBG", "ACBG", ..., "ACMB"]

    TODO:
    * refine grouping. Using np.unique seems unsafe and complex
    * consider using generator instead of list comprehension to save memory and time
    * consider alternative masking/filtering. Is numpy the fastest solution?
    """
    start = 0

    if verbose:
        start = time.time()
        print("Preprocessing - running ANOVA ... ", end='')

    ### Split annotation into groups
    # we use pandas.unique, as it is faster than numpy for big arrays
    # furthermore, it does not sort the groups and thus the original
    # order of indexes is maintained, improving locality
    idx = [np.where(annotation == i)[0] for i in pd.unique(annotation)]
    
    ### Iterate over genes and compute ANOVA for each gene
    def _anova_(row, groups):
        """Apply ANOVA to a vector
        Split the annotation by 
        """
        fval, pval = stats.f_oneway(*[row[g] for g in groups])
        return np.array([fval, pval])

    ### Create dataframe for ANOVA results
    f = np.apply_along_axis(func1d=_anova_, axis=1, arr=df.values, groups=(idx))

    df_anova = pd.DataFrame({"statistic": f[:,0], 
                         "pvalue": f[:,1]}, 
                        index=df.index)

    ### Filter original dataframe
    # Get row-index of genes where pvalue < threshold
    idx_thresh = (df_anova.values[:,1] < threshold)
    df_filtered = df.iloc[idx_thresh]

    if verbose:
        n_genes_org = len(df)
        n_genes_filtered = len(df_filtered)
        td = datetime.timedelta(seconds=(time.time() - start))
        print("excluded %d / %d genes in %d min %d sec" % ((n_genes_org - n_genes_filtered), n_genes_org, *divmod(td.seconds, 60)))
    
    return {'anova': df_anova, 'df': df_filtered}

Functions

def anova(df: pandas.core.frame.DataFrame, annotation: numpy.ndarray, threshold: float = 1e-05, verbose: bool = False)

Apply ANOVA and filter data Applies ANOVA to identify lowly / sporadically expressed genes and remove them. Values of each gene (row) is split into groups according to the annotation. ANOVA is then computed for these groups to get a p-value for this gene. The p-value is used to filter out genes with insignificant differences in variance.

Parameters

df : DataFrame: Log normalized expression data.
annotation : ndarray: Annotation of cells, e.g. cell-type or class.
threshold : float, optional (default: 0.00001): P-value cutoff.
verbose : bool, optional (default: False): Print progress report.

Returns

dict: "anova" : df_anova : DataFrame of F-values and p-values. "df" : df_filtered : DataFrame of genes with pval < threshold.
NOTE:

df format – rows: genes, cols: cells. A B C POMC 0.0 0.5 0.9 AGRP 0.2 0.0 0.0 LEPR 0.1 0.1 0.4
annotation format – 1 by n, i.e. one row. ["ABC", "ABC", "ABC", "ACBG", "ACBG", …, "ACMB"]

TODO:

refine grouping. Using np.unique seems unsafe and complex
consider using generator instead of list comprehension to save memory and time
consider alternative masking/filtering. Is numpy the fastest solution?

Expand source code

def anova(df: pd.DataFrame, annotation: np.ndarray, threshold: np.float=0.00001, verbose: bool=False):
    """Apply ANOVA and filter data
    Applies ANOVA to identify lowly / sporadically expressed genes and remove them.
    Values of each gene (row) is split into groups according to the annotation.
    ANOVA is then computed for these groups to get a p-value for this gene.
    The p-value is used to filter out genes with insignificant differences in variance.
    
    Parameters
    ----------
    df : DataFrame
        Log normalized expression data.
    
    annotation : ndarray
        Annotation of cells, e.g. cell-type or class.
    
    threshold : float, optional (default: 0.00001)
        P-value cutoff.

    verbose : bool, optional (default: False)
        Print progress report.
    
    Returns
    -------
    dict
        "anova" : df_anova    : DataFrame of F-values and p-values.
        "df"    : df_filtered : DataFrame of genes with pval < threshold.

    NOTE:
    * df format – rows: genes, cols: cells.
              A     B     C
        POMC  0.0   0.5   0.9
        AGRP  0.2   0.0   0.0
        LEPR  0.1   0.1   0.4

    * annotation format – 1 by n, i.e. one row.
        ["ABC", "ABC", "ABC", "ACBG", "ACBG", ..., "ACMB"]

    TODO:
    * refine grouping. Using np.unique seems unsafe and complex
    * consider using generator instead of list comprehension to save memory and time
    * consider alternative masking/filtering. Is numpy the fastest solution?
    """
    start = 0

    if verbose:
        start = time.time()
        print("Preprocessing - running ANOVA ... ", end='')

    ### Split annotation into groups
    # we use pandas.unique, as it is faster than numpy for big arrays
    # furthermore, it does not sort the groups and thus the original
    # order of indexes is maintained, improving locality
    idx = [np.where(annotation == i)[0] for i in pd.unique(annotation)]
    
    ### Iterate over genes and compute ANOVA for each gene
    def _anova_(row, groups):
        """Apply ANOVA to a vector
        Split the annotation by 
        """
        fval, pval = stats.f_oneway(*[row[g] for g in groups])
        return np.array([fval, pval])

    ### Create dataframe for ANOVA results
    f = np.apply_along_axis(func1d=_anova_, axis=1, arr=df.values, groups=(idx))

    df_anova = pd.DataFrame({"statistic": f[:,0], 
                         "pvalue": f[:,1]}, 
                        index=df.index)

    ### Filter original dataframe
    # Get row-index of genes where pvalue < threshold
    idx_thresh = (df_anova.values[:,1] < threshold)
    df_filtered = df.iloc[idx_thresh]

    if verbose:
        n_genes_org = len(df)
        n_genes_filtered = len(df_filtered)
        td = datetime.timedelta(seconds=(time.time() - start))
        print("excluded %d / %d genes in %d min %d sec" % ((n_genes_org - n_genes_filtered), n_genes_org, *divmod(td.seconds, 60)))
    
    return {'anova': df_anova, 'df': df_filtered}