Module cellex.metrics.nsi
Expand source code
import multiprocessing as mp
import numpy as np
import pandas as pd
import scipy as sp
import time
import datetime
from scipy import stats
from .esw_star import esw_star
from ..utils.compute_pvalues import compute_pvalues
from ..summarydata import SummaryData
def _nsi(mean: pd.DataFrame, verbose: bool=False):
"""Computes Normalized Specificity Index ES weights for each gene / cell-type
Parameters
----------
mean : DataFrame
Mean expression per gene / annotation group.
verbose : bool, optional (default: False)
Print progress report.
Returns
-------
result : ndarray
ES weights
TODO:
Filter data
"""
eps = 0.000000000001
# number of genes, cells / columns, unique clusters
n_genes = mean.shape[0]
n_cells = mean.shape[1]
# Compute SI
# Create output matrice for results
result = np.zeros(shape=(n_genes, n_cells))
view = mean.values + eps
selection = np.arange(n_cells)
### Compute nsi per column
for j in range(n_cells): # cycle cells
# Get indexes of other cells
other_cells = np.delete(selection, j)
# compute fold change for cell j.
# N.B. epsilon is added before loop.
fc = (view[:,[j]]/view[:,other_cells])
# filter fc values close to 0 and 1, since these are
# assumed to be artefacts of adding epsilon
eps_machine = np.finfo(float).eps
tol = eps_machine**0.5
fc[np.isclose(fc, 0, rtol=0, atol=tol)] = 0.
fc[np.isclose(fc, 1, rtol=0, atol=tol)] = 0.
# compute gene rank for each fold change result, i.e. rank of gene expr per cell
# we transspose the matrix once, so that we can iterate over cols as rows.
# we transpose the matrix once more to get back the original cols.
fc_ranked = np.array([sp.stats.rankdata(col, method="min") for col in fc.T]).T
# normalize: divide each column by number of genes. Subtract 1 in denominator and numerator
fc_ranked_norm = (fc_ranked - 1) / (fc_ranked.shape[0] - 1) # i.e. rank / n_rows
# compute average rank based on fold change ranks, i.e. compute mean per row / gene
fc_mean = fc_ranked_norm.mean(axis=1)
# store gene_rank results (a column) for cell j
result[:,j] = fc_mean
return result
def nsi(stats: SummaryData, verbose: bool=False, compute_meta: bool=False):
"""Compute Normalized Specificity Index
NSI is based on Specificity Index described in:
Dougherty, et al. Analytical approaches to RNA profiling data for the
identification of genes enriched in specific cells. Nucleic Acids Res.
38, 4218–4230 (2010).
and implemented in R. Code available at:
bactrap(.)org
Parameters
----------
summarydata : SummaryData
Summary data computed from raw data using specified annotation.
verbose : bool, optional (default: False)
Print progress report.
compute_meta : bool, optional (default: False)
Compute meta results.
Returns
-------
results : dict
Dictionary containing all computed ESw and meta results, e.g. pvals
"""
start = 0
if verbose:
start = time.time()
print("Computing NSI ...")
df = stats.mean
idx_labels = df.index
col_labels = stats.mean.columns.values
key = "nsi."
results = {}
if verbose:
print(" esw ...")
esw = _nsi(df, verbose)
esw_df = pd.DataFrame(esw, idx_labels, col_labels)
results[(key + "esw")] = esw_df
if compute_meta:
esw_null = _nsi(stats.mean_null, verbose)
pvals = compute_pvalues(esw, esw_null, verbose)
pvals_df = pd.DataFrame(pvals, idx_labels, col_labels)
results[(key + "esw_null")] = pd.DataFrame(esw_null, idx_labels, col_labels)
results[(key + "pvals")] = pvals_df
results[(key + "esw_s")] = esw_star(esw_df, pvals_df, verbose)
if verbose:
td = datetime.timedelta(seconds=(time.time() - start))
print(" finished in %d min %d sec" % (divmod(td.seconds, 60)))
return results
Functions
def nsi(stats: SummaryData, verbose: bool = False, compute_meta: bool = False)
-
Compute Normalized Specificity Index
NSI is based on Specificity Index described in:
Dougherty, et al. Analytical approaches to RNA profiling data for the identification of genes enriched in specific cells. Nucleic Acids Res. 38, 4218–4230 (2010).
and implemented in R. Code available at:
bactrap(.)org
Parameters
summarydata
:SummaryData
- Summary data computed from raw data using specified annotation.
verbose
:bool
, optional(default: False)
- Print progress report.
compute_meta
:bool
, optional(default: False)
- Compute meta results.
Returns
results
:dict
- Dictionary containing all computed ESw and meta results, e.g. pvals
Expand source code
def nsi(stats: SummaryData, verbose: bool=False, compute_meta: bool=False): """Compute Normalized Specificity Index NSI is based on Specificity Index described in: Dougherty, et al. Analytical approaches to RNA profiling data for the identification of genes enriched in specific cells. Nucleic Acids Res. 38, 4218–4230 (2010). and implemented in R. Code available at: bactrap(.)org Parameters ---------- summarydata : SummaryData Summary data computed from raw data using specified annotation. verbose : bool, optional (default: False) Print progress report. compute_meta : bool, optional (default: False) Compute meta results. Returns ------- results : dict Dictionary containing all computed ESw and meta results, e.g. pvals """ start = 0 if verbose: start = time.time() print("Computing NSI ...") df = stats.mean idx_labels = df.index col_labels = stats.mean.columns.values key = "nsi." results = {} if verbose: print(" esw ...") esw = _nsi(df, verbose) esw_df = pd.DataFrame(esw, idx_labels, col_labels) results[(key + "esw")] = esw_df if compute_meta: esw_null = _nsi(stats.mean_null, verbose) pvals = compute_pvalues(esw, esw_null, verbose) pvals_df = pd.DataFrame(pvals, idx_labels, col_labels) results[(key + "esw_null")] = pd.DataFrame(esw_null, idx_labels, col_labels) results[(key + "pvals")] = pvals_df results[(key + "esw_s")] = esw_star(esw_df, pvals_df, verbose) if verbose: td = datetime.timedelta(seconds=(time.time() - start)) print(" finished in %d min %d sec" % (divmod(td.seconds, 60))) return results