Module cellex.metrics.det
Expand source code
import numpy as np
import pandas as pd
import time
import datetime
from .esw_star import esw_star
from ..utils.compute_pvalues import compute_pvalues
from ..summarydata import SummaryData
def _det(mean: pd.DataFrame, var: pd.DataFrame, n_cells: pd.DataFrame, verbose: bool=False):
"""Computes Differential Expression T-statistic ES weights for each gene / cell-type
Parameters
----------
mean : DataFrame
Mean expression per gene / annotation group.
var : DataFrame
Expression variance per gene / annotation group.
n_cells : DataFrame
Number of cells per annotation group.
verbose : bool, optional (default: False)
Print progress report.
Returns
-------
result : ndarray
ES weights
TODO:
* Consider replacing args with SummaryStats. Probably not feasible,
since only tstat uses var and ncells.
* Consider the consequences of using fillna
* I think some details are missing in the computations,
e.g. subtracting 1 from denominator in var_pooled
* The way s_p is computed seems off
"""
### Compute pooled variance
variance = var
variance.fillna(0, inplace=True)
#n_cells = df.groupby(grouping, axis=1).count() # assuming this is an n_genes x n_annotations
n_cells = np.array([n_cells.values] * mean.shape[0]) # faster than count
# n_cells_sum = np.sum(n_cells, axis=1)
# s_p^2 = sum((sample_size_i - 1) * variance_i) / sum(sample_size_i - 1)
sd_pooled = np.sqrt(np.sum(((n_cells - 1) * variance.values[:,:]), axis=1) / (np.sum(n_cells - 1, axis=1)))
### Compute tstat per column
# should be possible to vectorize using helper function
# some of these things could be precomputed for the whole matrix
result = np.zeros(shape=(mean.shape[0], mean.shape[1])) # n_genes, n_cells
for col in mean:
### Compute X_1 and X_2
# Separate X and X_other. Use mean values.
# X is already mean
# X_other is processed: (mean_other * n_cells_other) / n_cells_sum
i = mean.columns.get_loc(col)
X_1 = mean.values[:,i]
mean_others = np.delete(mean.values[:,:], i, axis=1)
n_cells_other = np.delete(n_cells, i, axis=1)
X_others = np.sum((mean_others * n_cells_other), axis=1) / (np.sum(n_cells, axis=1))
### Compute n_1 and n_other
# n_1 is n_cells[1]
# n_other is sum(n_cells[other])
n_1 = n_cells[:,i]
n_other = np.sum(n_cells_other, axis=1)
### Compute scaling factor
sd_pooled_factor = np.sqrt((1/n_1) + (1/n_other))
### Compute tstat and save result
tstat = (X_1 - X_others) / (sd_pooled_factor * sd_pooled)
result[:,i] = tstat
return result
def det(stats: SummaryData, verbose: bool=False, compute_meta: bool=False):
"""Compute Differential Expression T-statistic
Parameters
----------
summarydata : SummaryData
Summary data computed from raw data using specified annotation.
verbose : bool, optional (default: False)
Print progress report.
compute_meta : bool, optional (default: False)
Compute meta results.
Returns
-------
results : dict
Dictionary containing all computed ESw and meta results, e.g. pvals
"""
start = 0
if verbose:
start = time.time()
print("Computing DET ... ")
idx_labels = stats.data.index
col_labels = stats.mean.columns.values
key = "det."
results = {}
if verbose:
print(" esw ...")
esw = _det(stats.mean, stats.variance, stats.n_cells_per_anno, verbose)
esw_df = pd.DataFrame(esw, idx_labels, col_labels)
results[(key + "esw")] = esw_df
if compute_meta:
esw_null = _det(stats.mean_null, stats.variance_null, stats.n_cells_per_anno_null, verbose)
pvals = compute_pvalues(esw, esw_null, verbose)
pvals_df = pd.DataFrame(pvals, idx_labels, col_labels)
results[(key + "esw_null")] = pd.DataFrame(esw_null, idx_labels, col_labels)
results[(key + "pvals")] = pvals_df
results[(key + "esw_s")] = esw_star(esw_df, pvals_df, verbose)
if verbose:
td = datetime.timedelta(seconds=(time.time() - start))
print(" finished in %d min %d sec" % (divmod(td.seconds, 60)))
return results
Functions
def det(stats: SummaryData, verbose: bool = False, compute_meta: bool = False)
-
Compute Differential Expression T-statistic
Parameters
summarydata
:SummaryData
- Summary data computed from raw data using specified annotation.
verbose
:bool
, optional(default: False)
- Print progress report.
compute_meta
:bool
, optional(default: False)
- Compute meta results.
Returns
results
:dict
- Dictionary containing all computed ESw and meta results, e.g. pvals
Expand source code
def det(stats: SummaryData, verbose: bool=False, compute_meta: bool=False): """Compute Differential Expression T-statistic Parameters ---------- summarydata : SummaryData Summary data computed from raw data using specified annotation. verbose : bool, optional (default: False) Print progress report. compute_meta : bool, optional (default: False) Compute meta results. Returns ------- results : dict Dictionary containing all computed ESw and meta results, e.g. pvals """ start = 0 if verbose: start = time.time() print("Computing DET ... ") idx_labels = stats.data.index col_labels = stats.mean.columns.values key = "det." results = {} if verbose: print(" esw ...") esw = _det(stats.mean, stats.variance, stats.n_cells_per_anno, verbose) esw_df = pd.DataFrame(esw, idx_labels, col_labels) results[(key + "esw")] = esw_df if compute_meta: esw_null = _det(stats.mean_null, stats.variance_null, stats.n_cells_per_anno_null, verbose) pvals = compute_pvalues(esw, esw_null, verbose) pvals_df = pd.DataFrame(pvals, idx_labels, col_labels) results[(key + "esw_null")] = pd.DataFrame(esw_null, idx_labels, col_labels) results[(key + "pvals")] = pvals_df results[(key + "esw_s")] = esw_star(esw_df, pvals_df, verbose) if verbose: td = datetime.timedelta(seconds=(time.time() - start)) print(" finished in %d min %d sec" % (divmod(td.seconds, 60))) return results