Module `cellex.metrics.ges`

Expand source code

import numpy as np
import pandas as pd
import time
import datetime
from .esw_star import esw_star
from ..utils.compute_pvalues import compute_pvalues
from ..summarydata import SummaryData

def _ges(data: pd.DataFrame, mean: pd.DataFrame, nnz: pd.DataFrame, n_cells_per_anno: pd.DataFrame, verbose: bool=False) -> pd.DataFrame:
    """Computes Gene Enrichment Score ES weights for each gene / cell-type

    Parameters
    ----------
    data : DataFrame
        Expression values per gene / cell.

    mean : DataFrame
        Mean expression per gene / annotation group.

    nnz : DataFrame
        Number of nonzero expression values per gene / annotation group.

    n_cells_per_anno : DataFrame
        Number of cells per annotation group.

    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    enrichment : ndarray
        ES weights
    
    TODO:
        Filter data
        Fix weird computations:
        * np.allclose(np.sum(df.values[:,:],axis=1), (mean_overall*n_cells_total)) >>> True
        * np.allclose(((nnz_overall / n_cells_total) * n_cells_total), nnz_overall) >>> True
    """
    
    # Compute nnz and mean
    mean = mean.values
    nnz = nnz.values

    # number of cells per cluster
    cluster_sizes = n_cells_per_anno.values

    # Non-zeros and mean over all cells
    nnz_overall = np.count_nonzero(data.values,axis=1)
    mean_overall = np.mean(data.values,axis=1)

    # number of cells / columns
    n_cells_total = data.shape[1]

    # Compute fraction of non_zero in clusters and overall
    f_nnz = nnz / cluster_sizes
    f_nnz_overall = nnz_overall / n_cells_total

    # Means and fraction non-zero values in other clusters (per cluster)
    # "((mean_overall * n_cells_total)[None].T - (mean * cluster_sizes))" => means for all genes scaled by n_cells_total minus
    # mean for each cluster scaled by cluster size returns mean of cells outside cluster i.
    # "(n_cells_total - cluster_sizes)" => an array of cell count outside cluster i.
    mean_other = ((mean_overall * n_cells_total)[None].T - (mean * cluster_sizes)) / (n_cells_total - cluster_sizes)
    f_nnz_other = ((f_nnz_overall * n_cells_total)[None].T - (f_nnz * cluster_sizes)) / (n_cells_total - cluster_sizes)

    e1 = 0.01 # org: 0.1
    e2 = 1e-100 # org: 0.01

    enrichment = ((f_nnz + e1) / (f_nnz_other + e1)) * ((mean + e2) / (mean_other + e2))

    # filter values close to 0
    eps_machine = np.finfo(float).eps
    tol = eps_machine**0.5
    enrichment[np.isclose(enrichment, 0, rtol=0, atol=tol)] = 0.

    return enrichment

def ges(stats: SummaryData, verbose: bool=False, compute_meta: bool=False):
    """Compute Gene Enrichment Score

    GES is based on the eponymous metric described in:

        Zeisel, et al. Molecular Architecture of the Mouse Nervous System. 
        Cell 174, 999- 1014.e22 (2018).

    and implemented as MarkerSelection in Cytograph. Code available at:

        github(.)com/linnarsson-lab/cytograph


    Parameters
    ----------
    summarydata : SummaryData
        Summary data computed from raw data using specified annotation.

    verbose : bool, optional (default: False)
        Print progress report.
    
    compute_meta : bool, optional (default: False)
        Compute meta results.

    Returns
    -------
    results : dict
        Dictionary containing all computed ESw and meta results, e.g. pvals
    
    """
    
    start = 0

    if verbose:
        start = time.time()
        print("Computing GES ...")

    # df = stats.data
    idx_labels = stats.data.index
    col_labels = stats.mean.columns.values
    key = "ges."

    results = {}

    if verbose:
        print("    esw ...")
    esw = _ges(stats.data, stats.mean, stats.n_nonzero, stats.n_cells_per_anno, verbose)
    esw_df = pd.DataFrame(data=esw, index=idx_labels, columns=col_labels)
    results[(key + "esw")] = esw_df

    if compute_meta:
        esw_null = _ges(stats.data, stats.mean_null, stats.n_nonzero_null, stats.n_cells_per_anno_null, verbose=verbose)
        pvals = compute_pvalues(esw, esw_null, verbose)
        pvals_df = pd.DataFrame(pvals, idx_labels, col_labels)
        results[(key + "esw_null")] = pd.DataFrame(esw_null, idx_labels, col_labels)
        results[(key + "pvals")] = pvals_df
        results[(key + "esw_s")] = esw_star(esw_df, pvals_df, verbose)

    if verbose:
        td = datetime.timedelta(seconds=(time.time() - start))
        print("    finished in %d min %d sec" % (divmod(td.seconds, 60)))

    return results

Functions

def ges(stats: SummaryData, verbose: bool = False, compute_meta: bool = False)

Compute Gene Enrichment Score

GES is based on the eponymous metric described in:

Zeisel, et al. Molecular Architecture of the Mouse Nervous System. 
Cell 174, 999- 1014.e22 (2018).

and implemented as MarkerSelection in Cytograph. Code available at:

github(.)com/linnarsson-lab/cytograph

Parameters

summarydata : SummaryData: Summary data computed from raw data using specified annotation.
verbose : bool, optional (default: False): Print progress report.
compute_meta : bool, optional (default: False): Compute meta results.

Returns

results : dict: Dictionary containing all computed ESw and meta results, e.g. pvals

Expand source code

def ges(stats: SummaryData, verbose: bool=False, compute_meta: bool=False):
    """Compute Gene Enrichment Score

    GES is based on the eponymous metric described in:

        Zeisel, et al. Molecular Architecture of the Mouse Nervous System. 
        Cell 174, 999- 1014.e22 (2018).

    and implemented as MarkerSelection in Cytograph. Code available at:

        github(.)com/linnarsson-lab/cytograph


    Parameters
    ----------
    summarydata : SummaryData
        Summary data computed from raw data using specified annotation.

    verbose : bool, optional (default: False)
        Print progress report.
    
    compute_meta : bool, optional (default: False)
        Compute meta results.

    Returns
    -------
    results : dict
        Dictionary containing all computed ESw and meta results, e.g. pvals
    
    """
    
    start = 0

    if verbose:
        start = time.time()
        print("Computing GES ...")

    # df = stats.data
    idx_labels = stats.data.index
    col_labels = stats.mean.columns.values
    key = "ges."

    results = {}

    if verbose:
        print("    esw ...")
    esw = _ges(stats.data, stats.mean, stats.n_nonzero, stats.n_cells_per_anno, verbose)
    esw_df = pd.DataFrame(data=esw, index=idx_labels, columns=col_labels)
    results[(key + "esw")] = esw_df

    if compute_meta:
        esw_null = _ges(stats.data, stats.mean_null, stats.n_nonzero_null, stats.n_cells_per_anno_null, verbose=verbose)
        pvals = compute_pvalues(esw, esw_null, verbose)
        pvals_df = pd.DataFrame(pvals, idx_labels, col_labels)
        results[(key + "esw_null")] = pd.DataFrame(esw_null, idx_labels, col_labels)
        results[(key + "pvals")] = pvals_df
        results[(key + "esw_s")] = esw_star(esw_df, pvals_df, verbose)

    if verbose:
        td = datetime.timedelta(seconds=(time.time() - start))
        print("    finished in %d min %d sec" % (divmod(td.seconds, 60)))

    return results