Module cellex.summarydata

Expand source code
import os
import numpy as np
import pandas as pd

class SummaryData(object):
    """Lazily computed, cached summary statistics of an expression matrix.

    The SummaryData object allows summary data to be shared between
    computations, reducing the number of computations needed, e.g. the
    mean only has to be computed once.

    The columns of ``data`` are grouped by ``annotation``; every statistic
    is computed on first access and cached in the matching underscore
    attribute. A second, "null" set of statistics is computed from a
    permuted copy of the annotation.

    Attributes
    ----------
    data : DataFrame
        Original expression data. Its columns are what gets grouped.

    annotation : array-like
        Annotation to group the columns of ``data`` by.

    _mean : DataFrame or None
        Cached mean expression per annotation group.

    _n_cells_per_anno : Series or None
        Cached number of columns per annotation group.

    _n_nonzero : DataFrame or None
        Cached number of nonzero expression values per annotation group.

    _variance : DataFrame or None
        Cached variance of expression values per annotation group.

    _annotation_null : np.ndarray or None
        Cached permutation of ``annotation``.

    _mean_null : DataFrame or None
        Cached mean expression per null annotation group.

    _n_cells_per_anno_null : Series or None
        Cached number of columns per null annotation group.

    _n_nonzero_null : DataFrame or None
        Cached number of nonzero expression values per null annotation group.

    _variance_null : DataFrame or None
        Cached variance of expression values per null annotation group.

    Properties
    ----------
    mean, n_cells_per_anno, n_nonzero, variance
        Lazily evaluate and return the corresponding cached statistic.

    annotation_null
        Lazily evaluate and return the permuted annotation.

    mean_null, n_cells_per_anno_null, n_nonzero_null, variance_null
        Lazily evaluate and return the corresponding null statistic.

    Methods
    -------
    save(dir_name: str=None, verbose: bool=False)
        Save the DataFrame-valued statistics, excluding the original data.
    """

    def __init__(self, data: pd.DataFrame, annotation: np.ndarray):
        """
        Parameters
        ----------
        data : DataFrame
            Original expression data.

        annotation : array
            Annotation to group the columns of ``data`` by.
        """
        self.data = data
        self.annotation = annotation

        # Caches; ``None`` means "not computed yet".
        self._mean = None
        self._n_cells_per_anno = None
        self._n_nonzero = None
        self._variance = None

        # Caches for the permuted (null) annotation.
        self._annotation_null = None
        self._mean_null = None
        self._n_nonzero_null = None
        self._n_cells_per_anno_null = None
        self._variance_null = None

    # NOTE: all statistics group the *transposed* frame by row rather than
    # calling ``groupby(..., axis=1)``, which is deprecated in pandas 2.1.

    @property
    def mean(self):
        """Mean of ``data`` per annotation group (computed once, then cached)."""
        if self._mean is None:
            self._mean = self.data.T.groupby(self.annotation).mean().T

        return self._mean

    @property
    def n_cells_per_anno(self):
        """Number of columns of ``data`` per annotation group (cached)."""
        if self._n_cells_per_anno is None:
            self._n_cells_per_anno = self.data.T.groupby(self.annotation).size()

        return self._n_cells_per_anno

    @property
    def n_nonzero(self):
        """Count of values != 0 per row and annotation group (cached)."""
        if self._n_nonzero is None:
            # Summing the boolean mask counts the nonzero entries per group.
            self._n_nonzero = self.data.ne(0).T.groupby(self.annotation).sum().T

        return self._n_nonzero

    @property
    def variance(self):
        """Variance of ``data`` per annotation group (cached)."""
        if self._variance is None:
            self._variance = self.data.T.groupby(self.annotation).var().T

        return self._variance

    ### Null summary data
    @property
    def annotation_null(self):
        """Permuted copy of ``annotation`` (computed once, then cached).

        Seeds NumPy's global random state with 1 before permuting, so the
        permutation is reproducible; this also resets the global state.
        """
        if self._annotation_null is None:
            np.random.seed(1)
            self._annotation_null = np.random.permutation(self.annotation)

        return self._annotation_null

    @property
    def mean_null(self):
        """Mean of ``data`` per null annotation group (cached)."""
        if self._mean_null is None:
            self._mean_null = self.data.T.groupby(self.annotation_null).mean().T

        return self._mean_null

    @property
    def n_cells_per_anno_null(self):
        """Number of columns of ``data`` per null annotation group (cached)."""
        if self._n_cells_per_anno_null is None:
            self._n_cells_per_anno_null = self.data.T.groupby(self.annotation_null).size()

        return self._n_cells_per_anno_null

    @property
    def n_nonzero_null(self):
        """Count of values != 0 per row and null annotation group (cached)."""
        if self._n_nonzero_null is None:
            self._n_nonzero_null = self.data.ne(0).T.groupby(self.annotation_null).sum().T

        return self._n_nonzero_null

    @property
    def variance_null(self):
        """Variance of ``data`` per null annotation group (cached)."""
        if self._variance_null is None:
            self._variance_null = self.data.T.groupby(self.annotation_null).var().T

        return self._variance_null

    def save(self, dir_name: str = None, verbose: bool = False) -> None:
        """Save summary statistics to disk.

        Every public attribute whose value is a DataFrame (except ``data``)
        is written to ``<dir_name>/summarystat.<attribute>.csv.gz``. Reading
        the attributes triggers the lazy computation of statistics that
        have not been accessed yet. Values that are not DataFrames (for
        example the Series returned by ``n_cells_per_anno``) are skipped.

        Parameters
        ----------
        dir_name : str, optional
            Output directory, created if missing. Defaults to ``"out"``.

        verbose : bool
            Print progress messages when True.
        """
        if verbose:
            print("Saving results to disk ...")

        if dir_name is None:
            dir_name = "out"

        os.makedirs(dir_name, exist_ok=True)  # make dir if it doesn't already exist

        for attr_name in dir(self):
            # Private caches (``_mean``) hold the same object as the public
            # property (``mean``); skipping them avoids duplicate files.
            if attr_name.startswith("_") or attr_name == "data":
                continue

            value = getattr(self, attr_name)
            if not isinstance(value, pd.DataFrame):
                continue

            fp = os.path.join(dir_name, "summarystat.{}.csv.gz".format(attr_name))
            value.to_csv(fp, compression="gzip")
            if verbose:
                print("  Saved: {}".format(fp))

            

Classes

class SummaryData (data: pandas.core.frame.DataFrame, annotation: numpy.ndarray)

A class that contains summary data for computing ES metrics

The SummaryData object allows sharing of summary data between computations, thus reducing the number of computations needed, e.g. we need only compute the mean once.

Attributes

data : DataFrame
Original expression data.
annotation
Annotation to group cells by.
_mean : DataFrame
Mean expression for the groups specified by the annotation.
_n_cells_per_anno : DataFrame
Number of cells per groups specified by the annotation.
_n_nonzero : DataFrame
Number of nonzero expression values per groups specified by the annotation.
_variance : DataFrame
Variance of expression values per groups specified by the annotation.
annotation_null : np.ndarray
Null annotation to group cells by.
_mean_null : DataFrame
Mean expression for the groups specified by the null annotation.
_n_cells_per_anno_null : DataFrame
Number of cells per groups specified by the null annotation.
_n_nonzero_null : DataFrame
Number of nonzero expression values per groups specified by the null annotation.
_variance_null : DataFrame
Variance of expression values per groups specified by the null annotation.

Methods

mean(self) Lazily evaluate and return _mean.

n_cells_per_anno(self) Lazily evaluate and return _n_cells_per_anno.

n_nonzero(self) Lazily evaluate and return _n_nonzero.

variance(self) Lazily evaluate and return _variance.

annotation_null(self) Lazily evaluate and return _annotation_null.

mean_null(self) Lazily evaluate and return _mean_null.

n_cells_per_anno_null(self) Lazily evaluate and return _n_cells_per_anno_null.

n_nonzero_null(self) Lazily evaluate and return _n_nonzero_null.

variance_null(self) Lazily evaluate and return _variance_null.

save(dir_name: str=None, verbose: bool=False) Save the object attributes, excluding the original data.

Parameters

data : DataFrame
Original expression data.
annotation : array
Annotation to group cells by.
Expand source code
class SummaryData(object):
    """Lazily computed, cached summary statistics of an expression matrix.

    The SummaryData object allows summary data to be shared between
    computations, reducing the number of computations needed, e.g. the
    mean only has to be computed once.

    The columns of ``data`` are grouped by ``annotation``; every statistic
    is computed on first access and cached in the matching underscore
    attribute. A second, "null" set of statistics is computed from a
    permuted copy of the annotation.

    Attributes
    ----------
    data : DataFrame
        Original expression data. Its columns are what gets grouped.

    annotation : array-like
        Annotation to group the columns of ``data`` by.

    _mean : DataFrame or None
        Cached mean expression per annotation group.

    _n_cells_per_anno : Series or None
        Cached number of columns per annotation group.

    _n_nonzero : DataFrame or None
        Cached number of nonzero expression values per annotation group.

    _variance : DataFrame or None
        Cached variance of expression values per annotation group.

    _annotation_null : np.ndarray or None
        Cached permutation of ``annotation``.

    _mean_null : DataFrame or None
        Cached mean expression per null annotation group.

    _n_cells_per_anno_null : Series or None
        Cached number of columns per null annotation group.

    _n_nonzero_null : DataFrame or None
        Cached number of nonzero expression values per null annotation group.

    _variance_null : DataFrame or None
        Cached variance of expression values per null annotation group.

    Properties
    ----------
    mean, n_cells_per_anno, n_nonzero, variance
        Lazily evaluate and return the corresponding cached statistic.

    annotation_null
        Lazily evaluate and return the permuted annotation.

    mean_null, n_cells_per_anno_null, n_nonzero_null, variance_null
        Lazily evaluate and return the corresponding null statistic.

    Methods
    -------
    save(dir_name: str=None, verbose: bool=False)
        Save the DataFrame-valued statistics, excluding the original data.
    """

    def __init__(self, data: pd.DataFrame, annotation: np.ndarray):
        """
        Parameters
        ----------
        data : DataFrame
            Original expression data.

        annotation : array
            Annotation to group the columns of ``data`` by.
        """
        self.data = data
        self.annotation = annotation

        # Caches; ``None`` means "not computed yet".
        self._mean = None
        self._n_cells_per_anno = None
        self._n_nonzero = None
        self._variance = None

        # Caches for the permuted (null) annotation.
        self._annotation_null = None
        self._mean_null = None
        self._n_nonzero_null = None
        self._n_cells_per_anno_null = None
        self._variance_null = None

    # NOTE: all statistics group the *transposed* frame by row rather than
    # calling ``groupby(..., axis=1)``, which is deprecated in pandas 2.1.

    @property
    def mean(self):
        """Mean of ``data`` per annotation group (computed once, then cached)."""
        if self._mean is None:
            self._mean = self.data.T.groupby(self.annotation).mean().T

        return self._mean

    @property
    def n_cells_per_anno(self):
        """Number of columns of ``data`` per annotation group (cached)."""
        if self._n_cells_per_anno is None:
            self._n_cells_per_anno = self.data.T.groupby(self.annotation).size()

        return self._n_cells_per_anno

    @property
    def n_nonzero(self):
        """Count of values != 0 per row and annotation group (cached)."""
        if self._n_nonzero is None:
            # Summing the boolean mask counts the nonzero entries per group.
            self._n_nonzero = self.data.ne(0).T.groupby(self.annotation).sum().T

        return self._n_nonzero

    @property
    def variance(self):
        """Variance of ``data`` per annotation group (cached)."""
        if self._variance is None:
            self._variance = self.data.T.groupby(self.annotation).var().T

        return self._variance

    ### Null summary data
    @property
    def annotation_null(self):
        """Permuted copy of ``annotation`` (computed once, then cached).

        Seeds NumPy's global random state with 1 before permuting, so the
        permutation is reproducible; this also resets the global state.
        """
        if self._annotation_null is None:
            np.random.seed(1)
            self._annotation_null = np.random.permutation(self.annotation)

        return self._annotation_null

    @property
    def mean_null(self):
        """Mean of ``data`` per null annotation group (cached)."""
        if self._mean_null is None:
            self._mean_null = self.data.T.groupby(self.annotation_null).mean().T

        return self._mean_null

    @property
    def n_cells_per_anno_null(self):
        """Number of columns of ``data`` per null annotation group (cached)."""
        if self._n_cells_per_anno_null is None:
            self._n_cells_per_anno_null = self.data.T.groupby(self.annotation_null).size()

        return self._n_cells_per_anno_null

    @property
    def n_nonzero_null(self):
        """Count of values != 0 per row and null annotation group (cached)."""
        if self._n_nonzero_null is None:
            self._n_nonzero_null = self.data.ne(0).T.groupby(self.annotation_null).sum().T

        return self._n_nonzero_null

    @property
    def variance_null(self):
        """Variance of ``data`` per null annotation group (cached)."""
        if self._variance_null is None:
            self._variance_null = self.data.T.groupby(self.annotation_null).var().T

        return self._variance_null

    def save(self, dir_name: str = None, verbose: bool = False) -> None:
        """Save summary statistics to disk.

        Every public attribute whose value is a DataFrame (except ``data``)
        is written to ``<dir_name>/summarystat.<attribute>.csv.gz``. Reading
        the attributes triggers the lazy computation of statistics that
        have not been accessed yet. Values that are not DataFrames (for
        example the Series returned by ``n_cells_per_anno``) are skipped.

        Parameters
        ----------
        dir_name : str, optional
            Output directory, created if missing. Defaults to ``"out"``.

        verbose : bool
            Print progress messages when True.
        """
        if verbose:
            print("Saving results to disk ...")

        if dir_name is None:
            dir_name = "out"

        os.makedirs(dir_name, exist_ok=True)  # make dir if it doesn't already exist

        for attr_name in dir(self):
            # Private caches (``_mean``) hold the same object as the public
            # property (``mean``); skipping them avoids duplicate files.
            if attr_name.startswith("_") or attr_name == "data":
                continue

            value = getattr(self, attr_name)
            if not isinstance(value, pd.DataFrame):
                continue

            fp = os.path.join(dir_name, "summarystat.{}.csv.gz".format(attr_name))
            value.to_csv(fp, compression="gzip")
            if verbose:
                print("  Saved: {}".format(fp))

Instance variables

var annotation_null
Expand source code
@property
def annotation_null(self):
    """Return a permuted copy of ``self.annotation``, cached after first use.

    On the first access NumPy's global random state is seeded with 1 and
    the annotation is permuted; the result is stored in
    ``self._annotation_null`` and returned unchanged on later accesses.
    """
    cached = self._annotation_null
    if cached is not None:
        return cached

    # Fixed seed: the same annotation always yields the same permutation.
    np.random.seed(1)
    self._annotation_null = np.random.permutation(self.annotation)
    return self._annotation_null
var mean

Compute or return mean

Expand source code
@property
def mean(self):
    """Return the mean of ``self.data`` per annotation group.

    The columns of ``self.data`` are grouped by ``self.annotation`` and
    averaged. The result is computed on first access, stored in
    ``self._mean`` and reused afterwards.
    """
    if self._mean is None:
        # Group the transposed frame by row; ``groupby(..., axis=1)`` is
        # deprecated in pandas 2.1.
        self._mean = self.data.T.groupby(self.annotation).mean().T

    return self._mean
var mean_null
Expand source code
@property
def mean_null(self):
    """Return the mean of ``self.data`` per null annotation group.

    The columns of ``self.data`` are grouped by ``self.annotation_null``
    and averaged. The result is computed on first access, stored in
    ``self._mean_null`` and reused afterwards.
    """
    if self._mean_null is None:
        # Group the transposed frame by row; ``groupby(..., axis=1)`` is
        # deprecated in pandas 2.1.
        self._mean_null = self.data.T.groupby(self.annotation_null).mean().T

    return self._mean_null
var n_cells_per_anno
Expand source code
@property
def n_cells_per_anno(self):
    """Return the number of columns of ``self.data`` per annotation group.

    The result of the group-wise ``size()`` is computed on first access,
    stored in ``self._n_cells_per_anno`` and reused afterwards.
    """
    if self._n_cells_per_anno is None:
        # Group the transposed frame by row; ``groupby(..., axis=1)`` is
        # deprecated in pandas 2.1.
        self._n_cells_per_anno = self.data.T.groupby(self.annotation).size()

    return self._n_cells_per_anno
var n_cells_per_anno_null
Expand source code
@property
def n_cells_per_anno_null(self):
    """Return the number of columns of ``self.data`` per null annotation group.

    The result of the group-wise ``size()`` is computed on first access,
    stored in ``self._n_cells_per_anno_null`` and reused afterwards.
    """
    if self._n_cells_per_anno_null is None:
        # Group the transposed frame by row; ``groupby(..., axis=1)`` is
        # deprecated in pandas 2.1.
        self._n_cells_per_anno_null = self.data.T.groupby(self.annotation_null).size()

    return self._n_cells_per_anno_null
var n_nonzero
Expand source code
@property
def n_nonzero(self):
    """Return the count of values != 0 per row and annotation group.

    The columns of ``self.data`` are grouped by ``self.annotation``. The
    result is computed on first access, stored in ``self._n_nonzero`` and
    reused afterwards.
    """
    if self._n_nonzero is None:
        # Summing the boolean ``!= 0`` mask counts nonzero entries per group.
        # The transposed frame is grouped by row because
        # ``groupby(..., axis=1)`` is deprecated in pandas 2.1.
        self._n_nonzero = self.data.ne(0).T.groupby(self.annotation).sum().T

    return self._n_nonzero
var n_nonzero_null
Expand source code
@property
def n_nonzero_null(self):
    """Return the count of values != 0 per row and null annotation group.

    The columns of ``self.data`` are grouped by ``self.annotation_null``.
    The result is computed on first access, stored in
    ``self._n_nonzero_null`` and reused afterwards.
    """
    if self._n_nonzero_null is None:
        # Summing the boolean ``!= 0`` mask counts nonzero entries per group.
        # The transposed frame is grouped by row because
        # ``groupby(..., axis=1)`` is deprecated in pandas 2.1.
        self._n_nonzero_null = self.data.ne(0).T.groupby(self.annotation_null).sum().T

    return self._n_nonzero_null
var variance
Expand source code
@property
def variance(self):
    """Return the variance of ``self.data`` per annotation group.

    The columns of ``self.data`` are grouped by ``self.annotation`` and
    reduced with ``var()``. The result is computed on first access, stored
    in ``self._variance`` and reused afterwards.
    """
    if self._variance is None:
        # Group the transposed frame by row; ``groupby(..., axis=1)`` is
        # deprecated in pandas 2.1.
        self._variance = self.data.T.groupby(self.annotation).var().T

    return self._variance
var variance_null
Expand source code
@property
def variance_null(self):
    """Return the variance of ``self.data`` per null annotation group.

    The columns of ``self.data`` are grouped by ``self.annotation_null``
    and reduced with ``var()``. The result is computed on first access,
    stored in ``self._variance_null`` and reused afterwards.
    """
    if self._variance_null is None:
        # Group the transposed frame by row; ``groupby(..., axis=1)`` is
        # deprecated in pandas 2.1.
        self._variance_null = self.data.T.groupby(self.annotation_null).var().T

    return self._variance_null

Methods

def save(self, dir_name: str = None, verbose: bool = False) -> NoneType

Save summary statistics to disk.

Expand source code
def save(self, dir_name: str = None, verbose: bool = False) -> None:
    """Save summary statistics to disk.

    Every public attribute of ``self`` whose value is a DataFrame (except
    ``data``) is written to ``<dir_name>/summarystat.<attribute>.csv.gz``.
    Attributes are read with ``getattr``, so lazily evaluated properties
    are computed as a side effect. Values that are not DataFrames are
    skipped.

    Parameters
    ----------
    dir_name : str, optional
        Output directory, created if missing. Defaults to ``"out"``.

    verbose : bool
        Print progress messages when True.
    """
    if verbose:
        print("Saving results to disk ...")

    if dir_name is None:
        dir_name = "out"

    os.makedirs(dir_name, exist_ok=True)  # make dir if it doesn't already exist

    for attr_name in dir(self):
        # Underscore-prefixed names are skipped so a value reachable under
        # both a private and a public name is not written twice.
        if attr_name.startswith("_") or attr_name == "data":
            continue

        value = getattr(self, attr_name)
        if not isinstance(value, pd.DataFrame):
            continue

        # The attribute name makes each file path unique.
        fp = os.path.join(dir_name, "summarystat.{}.csv.gz".format(attr_name))
        value.to_csv(fp, compression="gzip")
        if verbose:
            print("  Saved: {}".format(fp))