Module cellex.esobject

Expand source code
import h5py
import numpy as np
import os
import pandas as pd
import sys
from .summarydata import SummaryData
from . import preprocessing
from . import metrics
from . import utils
from cellex import ES_METRICS


class ESObject(object):
    """A class that integrates the CELLEX workflow to compute ES

    Workflow: filter non-expressed, normalize, filter non-varying, compute
    ESw, compute p-values, compute ESw*, compute ESmu and ESsd


    Attributes
    ----------
    results : dict
        Computed results keyed by result name. Starts empty and is
        filled by compute().
    summary_data : SummaryData
        Built from the preprocessed data and the annotation.

    Methods
    -------
    compute(self, esms: list=None, verbose: bool=False, compute_meta: bool=True) -> None:
    save_as_csv(self, file_prefix: str=None, path: str=None, keys: list=None, verbose: bool=False) -> None:
    save_as_hdf(self, filename: str, path: str=None, keys: list=None, verbose: bool=False) -> None:
    """

    def __init__(self,
                data: pd.DataFrame,
                annotation: pd.Series,
                dtype: str="float32",
                remove_non_expressed: bool=True,
                normalize: bool=True,
                anova: bool=True,
                verbose: bool=False
                ):
        """
        Parameters
        ----------
        data: DataFrame
            Original expression data.

        annotation : Series
            Annotation to group cells by.

        dtype : str, optional (default: "float32")
            Datatype to use.

        remove_non_expressed : bool, optional (default: True)
            Remove non-expressed genes from dataframe.

        normalize : bool, optional (default: True)
            Normalize data.

        anova : bool, optional (default: True)
            Perform anova to remove genes with low variance.

        verbose : bool, optional (default: False)
            Passed on to the parsing and preprocessing steps.
        """

        ### Preprocessing steps
        # default: cast input datatype to type with smaller memory footprint
        data = data.astype(dtype=dtype)

        # parse data and metadata, i.e. run various quality checks
        data, annotation = utils.parse_input(data, annotation, verbose)

        if remove_non_expressed:
            data = preprocessing.remove_non_expressed(df=data, verbose=verbose)

        if normalize:
            data = preprocessing.log_normalize(df=data, verbose=verbose)

        if anova:
            # anova returns dict of two dataframes. Select the filtered "df"
            data = preprocessing.anova(df=data, annotation=annotation, verbose=verbose)["df"]

        self.results = {}

        ### Create SummaryData object
        self.summary_data = SummaryData(data, annotation)

    def compute(self,
                esms: list=None,
                verbose: bool=False,
                compute_meta: bool=True) -> None:
        """Compute ESw using specified ES metrics using object annotation.

        Results are stored in a dictionary with key:
            <esm>.<esw_type>

        When compute_meta is True, the keys "esmu" and "essd" are added
        as well. Everything is merged into self.results.

        Parameters
        ----------
        esms : list(str), optional (default: None)
            List of ES metrics to compute. Names are lower-cased before
            use. If None, all metrics in ES_METRICS are computed.
        verbose : bool, optional (default: False)
            Print progress report.
        compute_meta: bool, optional (default: True)
            Compute ESmu, ESsd, and prerequisites.

        Returns
        -------
            None

        Raises
        ------
        ValueError
            If a requested metric is not listed in ES_METRICS.
        """

        if esms is None:
            esms = ES_METRICS

        # Lower-case once and validate every name before computing anything,
        # so an invalid name fails fast without partial work.
        names = [m.lower() for m in esms]
        for name in names:
            if name not in ES_METRICS:
                raise ValueError("No such metric: {}".format(name))

        results = {}

        for name in names:
            # each metric is a function of the same name in the metrics module
            esm_result = getattr(metrics, name)(self.summary_data, verbose, compute_meta)
            results.update(esm_result)

        if compute_meta:
            # ESmu and ESsd are derived from every result whose key contains "esw_s"
            esws = [val for key, val in results.items() if "esw_s" in key]
            results["esmu"] = metrics.es_mu(esws, verbose)
            results["essd"] = metrics.es_sd(esws, verbose)

        if verbose:
            print("Computed %a." % list(results.keys()))

        self.results.update(results)

        return None

    def save_as_csv(self, file_prefix: str=None, path: str=None, keys: list=None, verbose: bool=False) -> None:
        """Save results as multiple csv files

        Each selected result in self.results is written to its own
        gzip-compressed file:
            <path>/<prefix>.<key>.csv.gz

        Parameters
        ----------
        file_prefix : str, optional (default: None)
            Prefix to prepend to filenames, i.e.
            <prefix>.<key>.csv.gz
            If None, no prefix is used.

        path : str, optional (default: None)
            Path to save to. If None, saves to "out".
            The directory is created if it does not exist.

        keys : list or str, optional (default: ["esmu", "essd"])
            Keys of results in self.results dictionary.
            May be used for saving only specific results.
            A single key may be given as a plain string.
            If "all" is among the keys, every result is saved.

        verbose : bool, optional (default: False)
            Print progress report.

        Returns
        -------
        None
        """
        if verbose:
            print("Saving results as csv to disk ...")

        if keys is None:
            keys = ["esmu", "essd"]
        elif isinstance(keys, str):
            # a bare string would otherwise be iterated character by character
            keys = [keys]

        if "all" in keys:
            keys = list(self.results)

        if path is None:
            path = "out"

        if file_prefix is None:
            file_prefix = ""
        else:
            file_prefix = "{}.".format(file_prefix)

        os.makedirs(path, exist_ok=True) # make dir if it doesn't already exist

        ### Save results
        for k in keys:
            # Only the lookup is guarded: a KeyError raised while writing
            # must not be mistaken for a missing result.
            try:
                df = self.results[k]
            except KeyError:
                print("  WARNING: Key \"{}\" does not exist in ESObject results. No data saved.".format(k))
                continue

            fp = "{}/{}{}.csv.gz".format(path, file_prefix, k)
            df.to_csv(fp, compression="gzip")
            if verbose:
                print("  Saved: {}".format(fp))

        if verbose:
            print("Finished saving results to {}".format(path))


    def save_as_hdf(self, filename: str, path: str=None, keys: list=None, verbose: bool=False) -> None:
        """Save results to a single hdf file

        The file holds "metadata/axis0" and "metadata/axis1" (columns and
        index of self.summary_data.mean, stored as byte strings) and one
        dataset "data/<key>" per saved result.

        Parameters
        ----------
        filename : str
            Filename to write to. The extension ".h5" is appended.

        path : str, optional (default: None)
            Path to save to. If None, saves to "out".
            The directory is created if it does not exist.

        keys : list or str, optional (default: all keys in self.results)
            Keys of results in self.results dictionary.
            May be used for saving only specific results.
            A single key may be given as a plain string.

        verbose : bool, optional (default: False)
            Print progress report.

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            If there are no keys to save.
        KeyError
            If a key does not exist in self.results.
        """
        if verbose:
            print("Saving results as hdf to disk ...")

        if keys is None:
            keys = list(self.results)
        elif isinstance(keys, str):
            # a bare string would otherwise be iterated character by character
            keys = [keys]

        # Raised explicitly (rather than via `assert`) so the check also
        # runs under `python -O`; the exception type is unchanged.
        if len(keys) == 0:
            raise AssertionError("No results to save in ESObject.results.")

        if path is None:
            path = "out"

        filename = "{}/{}.h5".format(path, filename)

        os.makedirs(path, exist_ok=True) # make dir if it doesn't already exist

        # mode "w-": create the file, fail if it already exists
        with h5py.File(filename, "w-") as f:
            axis0 = self.summary_data.mean.columns.values.astype("S")
            axis1 = self.summary_data.mean.index.values.astype("S")
            f["metadata/axis0"] = axis0
            f["metadata/axis1"] = axis1

            if verbose:
                print("  Saved: metadata/axis0")
                print("  Saved: metadata/axis1")

            for k in keys:
                f["data/{}".format(k)] = self.results[k].values
                if verbose:
                    print("  Saved: data/{}".format(k))

            f.flush()

        if verbose:
            print("Finished saving results to {}".format(filename))

Classes

class ESObject (data: pandas.core.frame.DataFrame, annotation: pandas.core.series.Series, dtype: str = 'float32', remove_non_expressed: bool = True, normalize: bool = True, anova: bool = True, verbose: bool = False)

A class that integrates the CELLEX workflow to compute ES

Workflow: filter non-expressed, normalize, filter non-varying, compute ESw, compute p-values, compute ESw*, compute ESmu and ESsd

Attributes

results : dict
 
summary_data : SummaryData
 

Methods

compute(self, esms: list=None, verbose: bool=False, compute_meta: bool=True) -> None
save_as_csv(self, file_prefix: str=None, path: str=None, keys: list=None, verbose: bool=False) -> None
save_as_hdf(self, filename: str, path: str=None, keys: list=None, verbose: bool=False) -> None

Parameters

data : DataFrame
Original expression data.
annotation : Series
Annotation to group cells by.
dtype : str, optional (default: "float32")
Datatype to use.
remove_non_expressed : bool, optional (default: True)
Remove non-expressed genes from dataframe.
normalize : bool, optional (default: True)
Normalize data.
anova : bool, optional (default: True)
Perform anova to remove genes with low variance.
Expand source code
class ESObject(object):
    """A class that integrates the CELLEX workflow to compute ES

    Workflow: filter non-expressed, normalize, filter non-varying, compute
    ESw, compute p-values, compute ESw*, compute ESmu and ESsd


    Attributes
    ----------
    results : dict
        Computed results keyed by result name. Starts empty and is
        filled by compute().
    summary_data : SummaryData
        Built from the preprocessed data and the annotation.

    Methods
    -------
    compute(self, esms: list=None, verbose: bool=False, compute_meta: bool=True) -> None:
    save_as_csv(self, file_prefix: str=None, path: str=None, keys: list=None, verbose: bool=False) -> None:
    save_as_hdf(self, filename: str, path: str=None, keys: list=None, verbose: bool=False) -> None:
    """

    def __init__(self,
                data: pd.DataFrame,
                annotation: pd.Series,
                dtype: str="float32",
                remove_non_expressed: bool=True,
                normalize: bool=True,
                anova: bool=True,
                verbose: bool=False
                ):
        """
        Parameters
        ----------
        data: DataFrame
            Original expression data.

        annotation : Series
            Annotation to group cells by.

        dtype : str, optional (default: "float32")
            Datatype to use.

        remove_non_expressed : bool, optional (default: True)
            Remove non-expressed genes from dataframe.

        normalize : bool, optional (default: True)
            Normalize data.

        anova : bool, optional (default: True)
            Perform anova to remove genes with low variance.

        verbose : bool, optional (default: False)
            Passed on to the parsing and preprocessing steps.
        """

        ### Preprocessing steps
        # default: cast input datatype to type with smaller memory footprint
        data = data.astype(dtype=dtype)

        # parse data and metadata, i.e. run various quality checks
        data, annotation = utils.parse_input(data, annotation, verbose)

        if remove_non_expressed:
            data = preprocessing.remove_non_expressed(df=data, verbose=verbose)

        if normalize:
            data = preprocessing.log_normalize(df=data, verbose=verbose)

        if anova:
            # anova returns dict of two dataframes. Select the filtered "df"
            data = preprocessing.anova(df=data, annotation=annotation, verbose=verbose)["df"]

        self.results = {}

        ### Create SummaryData object
        self.summary_data = SummaryData(data, annotation)

    def compute(self,
                esms: list=None,
                verbose: bool=False,
                compute_meta: bool=True) -> None:
        """Compute ESw using specified ES metrics using object annotation.

        Results are stored in a dictionary with key:
            <esm>.<esw_type>

        When compute_meta is True, the keys "esmu" and "essd" are added
        as well. Everything is merged into self.results.

        Parameters
        ----------
        esms : list(str), optional (default: None)
            List of ES metrics to compute. Names are lower-cased before
            use. If None, all metrics in ES_METRICS are computed.
        verbose : bool, optional (default: False)
            Print progress report.
        compute_meta: bool, optional (default: True)
            Compute ESmu, ESsd, and prerequisites.

        Returns
        -------
            None

        Raises
        ------
        ValueError
            If a requested metric is not listed in ES_METRICS.
        """

        if esms is None:
            esms = ES_METRICS

        # Lower-case once and validate every name before computing anything,
        # so an invalid name fails fast without partial work.
        names = [m.lower() for m in esms]
        for name in names:
            if name not in ES_METRICS:
                raise ValueError("No such metric: {}".format(name))

        results = {}

        for name in names:
            # each metric is a function of the same name in the metrics module
            esm_result = getattr(metrics, name)(self.summary_data, verbose, compute_meta)
            results.update(esm_result)

        if compute_meta:
            # ESmu and ESsd are derived from every result whose key contains "esw_s"
            esws = [val for key, val in results.items() if "esw_s" in key]
            results["esmu"] = metrics.es_mu(esws, verbose)
            results["essd"] = metrics.es_sd(esws, verbose)

        if verbose:
            print("Computed %a." % list(results.keys()))

        self.results.update(results)

        return None

    def save_as_csv(self, file_prefix: str=None, path: str=None, keys: list=None, verbose: bool=False) -> None:
        """Save results as multiple csv files

        Each selected result in self.results is written to its own
        gzip-compressed file:
            <path>/<prefix>.<key>.csv.gz

        Parameters
        ----------
        file_prefix : str, optional (default: None)
            Prefix to prepend to filenames, i.e.
            <prefix>.<key>.csv.gz
            If None, no prefix is used.

        path : str, optional (default: None)
            Path to save to. If None, saves to "out".
            The directory is created if it does not exist.

        keys : list or str, optional (default: ["esmu", "essd"])
            Keys of results in self.results dictionary.
            May be used for saving only specific results.
            A single key may be given as a plain string.
            If "all" is among the keys, every result is saved.

        verbose : bool, optional (default: False)
            Print progress report.

        Returns
        -------
        None
        """
        if verbose:
            print("Saving results as csv to disk ...")

        if keys is None:
            keys = ["esmu", "essd"]
        elif isinstance(keys, str):
            # a bare string would otherwise be iterated character by character
            keys = [keys]

        if "all" in keys:
            keys = list(self.results)

        if path is None:
            path = "out"

        if file_prefix is None:
            file_prefix = ""
        else:
            file_prefix = "{}.".format(file_prefix)

        os.makedirs(path, exist_ok=True) # make dir if it doesn't already exist

        ### Save results
        for k in keys:
            # Only the lookup is guarded: a KeyError raised while writing
            # must not be mistaken for a missing result.
            try:
                df = self.results[k]
            except KeyError:
                print("  WARNING: Key \"{}\" does not exist in ESObject results. No data saved.".format(k))
                continue

            fp = "{}/{}{}.csv.gz".format(path, file_prefix, k)
            df.to_csv(fp, compression="gzip")
            if verbose:
                print("  Saved: {}".format(fp))

        if verbose:
            print("Finished saving results to {}".format(path))


    def save_as_hdf(self, filename: str, path: str=None, keys: list=None, verbose: bool=False) -> None:
        """Save results to a single hdf file

        The file holds "metadata/axis0" and "metadata/axis1" (columns and
        index of self.summary_data.mean, stored as byte strings) and one
        dataset "data/<key>" per saved result.

        Parameters
        ----------
        filename : str
            Filename to write to. The extension ".h5" is appended.

        path : str, optional (default: None)
            Path to save to. If None, saves to "out".
            The directory is created if it does not exist.

        keys : list or str, optional (default: all keys in self.results)
            Keys of results in self.results dictionary.
            May be used for saving only specific results.
            A single key may be given as a plain string.

        verbose : bool, optional (default: False)
            Print progress report.

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            If there are no keys to save.
        KeyError
            If a key does not exist in self.results.
        """
        if verbose:
            print("Saving results as hdf to disk ...")

        if keys is None:
            keys = list(self.results)
        elif isinstance(keys, str):
            # a bare string would otherwise be iterated character by character
            keys = [keys]

        # Raised explicitly (rather than via `assert`) so the check also
        # runs under `python -O`; the exception type is unchanged.
        if len(keys) == 0:
            raise AssertionError("No results to save in ESObject.results.")

        if path is None:
            path = "out"

        filename = "{}/{}.h5".format(path, filename)

        os.makedirs(path, exist_ok=True) # make dir if it doesn't already exist

        # mode "w-": create the file, fail if it already exists
        with h5py.File(filename, "w-") as f:
            axis0 = self.summary_data.mean.columns.values.astype("S")
            axis1 = self.summary_data.mean.index.values.astype("S")
            f["metadata/axis0"] = axis0
            f["metadata/axis1"] = axis1

            if verbose:
                print("  Saved: metadata/axis0")
                print("  Saved: metadata/axis1")

            for k in keys:
                f["data/{}".format(k)] = self.results[k].values
                if verbose:
                    print("  Saved: data/{}".format(k))

            f.flush()

        if verbose:
            print("Finished saving results to {}".format(filename))

Methods

def compute(self, esms: list = None, verbose: bool = False, compute_meta: bool = True) -> NoneType

Compute ESw using specified ES metrics using object annotation.

Results are stored in a dictionary with key: <esm>.<esw_type>

Parameters

esms : list(str), optional (default: None)
List of ES metrics to compute
verbose : bool, optional (default: False)
Print progress report.
compute_meta : bool, optional (default: True)
Compute ESmu, ESsd, and prerequisites.

Returns

None
Expand source code
def compute(self,
            esms: list=None,
            verbose: bool=False,
            compute_meta: bool=True) -> None:
    """Compute ESw using specified ES metrics using object annotation.

    Results are stored in a dictionary with key:
        <esm>.<esw_type>

    When compute_meta is True, the keys "esmu" and "essd" are added
    as well. Everything is merged into self.results.

    Parameters
    ----------
    esms : list(str), optional (default: None)
        List of ES metrics to compute. Names are lower-cased before
        use. If None, all metrics in ES_METRICS are computed.
    verbose : bool, optional (default: False)
        Print progress report.
    compute_meta: bool, optional (default: True)
        Compute ESmu, ESsd, and prerequisites.

    Returns
    -------
        None

    Raises
    ------
    ValueError
        If a requested metric is not listed in ES_METRICS.
    """

    if esms is None:
        esms = ES_METRICS

    # Lower-case once and validate every name before computing anything,
    # so an invalid name fails fast without partial work.
    names = [m.lower() for m in esms]
    for name in names:
        if name not in ES_METRICS:
            raise ValueError("No such metric: {}".format(name))

    results = {}

    for name in names:
        # each metric is a function of the same name in the metrics module
        esm_result = getattr(metrics, name)(self.summary_data, verbose, compute_meta)
        results.update(esm_result)

    if compute_meta:
        # ESmu and ESsd are derived from every result whose key contains "esw_s"
        esws = [val for key, val in results.items() if "esw_s" in key]
        results["esmu"] = metrics.es_mu(esws, verbose)
        results["essd"] = metrics.es_sd(esws, verbose)

    if verbose:
        print("Computed %a." % list(results.keys()))

    self.results.update(results)

    return None
def save_as_csv(self, file_prefix: str = None, path: str = None, keys: list = None, verbose: bool = False) -> NoneType

Save results as multiple csv files

Saves the selected results in self.results to a directory (default: "out"). Results include esw, esw_null, pvals, qvals and ESmu.

Parameters

file_prefix : str, optional (default: None)
Prefix to prepend to filenames, i.e. <prefix>.<metric>.<item>.csv.gz
path : str, optional (default: None)
Path to save to. If None, saves to "out".
keys : list, optional (default: ["esmu", "essd"])
Keys of results in self.results dictionary. May be used for saving only specific results. If "all" is among the keys, every result in self.results is saved.
verbose : bool, optional (default: False)
Print progress report.

Returns

None
 
Expand source code
def save_as_csv(self, file_prefix: str=None, path: str=None, keys: list=None, verbose: bool=False) -> None:
    """Save results as multiple csv files

    Each selected result in self.results is written to its own
    gzip-compressed file:
        <path>/<prefix>.<key>.csv.gz

    Parameters
    ----------
    file_prefix : str, optional (default: None)
        Prefix to prepend to filenames, i.e.
        <prefix>.<key>.csv.gz
        If None, no prefix is used.

    path : str, optional (default: None)
        Path to save to. If None, saves to "out".
        The directory is created if it does not exist.

    keys : list or str, optional (default: ["esmu", "essd"])
        Keys of results in self.results dictionary.
        May be used for saving only specific results.
        A single key may be given as a plain string.
        If "all" is among the keys, every result is saved.

    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    None
    """
    if verbose:
        print("Saving results as csv to disk ...")

    if keys is None:
        keys = ["esmu", "essd"]
    elif isinstance(keys, str):
        # a bare string would otherwise be iterated character by character
        keys = [keys]

    if "all" in keys:
        keys = list(self.results)

    if path is None:
        path = "out"

    if file_prefix is None:
        file_prefix = ""
    else:
        file_prefix = "{}.".format(file_prefix)

    os.makedirs(path, exist_ok=True) # make dir if it doesn't already exist

    ### Save results
    for k in keys:
        # Only the lookup is guarded: a KeyError raised while writing
        # must not be mistaken for a missing result.
        try:
            df = self.results[k]
        except KeyError:
            print("  WARNING: Key \"{}\" does not exist in ESObject results. No data saved.".format(k))
            continue

        fp = "{}/{}{}.csv.gz".format(path, file_prefix, k)
        df.to_csv(fp, compression="gzip")
        if verbose:
            print("  Saved: {}".format(fp))

    if verbose:
        print("Finished saving results to {}".format(path))
def save_as_hdf(self, filename: str, path: str = None, keys: list = None, verbose: bool = False) -> NoneType

Save results to a single hdf file

Parameters

filename : str
Filename to write to.
path : str, optional (default: None)
Path to save to. If None, saves to "out".
keys : list, optional (default: all keys in self.results)
Keys of results in self.results dictionary. May be used for saving only specific results.
verbose : bool, optional (default: False)
Print progress report.

Returns

None
 
Expand source code
def save_as_hdf(self, filename: str, path: str=None, keys: list=None, verbose: bool=False) -> None:
    """Save results to a single hdf file

    The file holds "metadata/axis0" and "metadata/axis1" (columns and
    index of self.summary_data.mean, stored as byte strings) and one
    dataset "data/<key>" per saved result.

    Parameters
    ----------
    filename : str
        Filename to write to. The extension ".h5" is appended.

    path : str, optional (default: None)
        Path to save to. If None, saves to "out".
        The directory is created if it does not exist.

    keys : list or str, optional (default: all keys in self.results)
        Keys of results in self.results dictionary.
        May be used for saving only specific results.
        A single key may be given as a plain string.

    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If there are no keys to save.
    KeyError
        If a key does not exist in self.results.
    """
    if verbose:
        print("Saving results as hdf to disk ...")

    if keys is None:
        keys = list(self.results)
    elif isinstance(keys, str):
        # a bare string would otherwise be iterated character by character
        keys = [keys]

    # Raised explicitly (rather than via `assert`) so the check also
    # runs under `python -O`; the exception type is unchanged.
    if len(keys) == 0:
        raise AssertionError("No results to save in ESObject.results.")

    if path is None:
        path = "out"

    filename = "{}/{}.h5".format(path, filename)

    os.makedirs(path, exist_ok=True) # make dir if it doesn't already exist

    # mode "w-": create the file, fail if it already exists
    with h5py.File(filename, "w-") as f:
        axis0 = self.summary_data.mean.columns.values.astype("S")
        axis1 = self.summary_data.mean.index.values.astype("S")
        f["metadata/axis0"] = axis0
        f["metadata/axis1"] = axis1

        if verbose:
            print("  Saved: metadata/axis0")
            print("  Saved: metadata/axis1")

        for k in keys:
            f["data/{}".format(k)] = self.results[k].values
            if verbose:
                print("  Saved: data/{}".format(k))

        f.flush()

    if verbose:
        print("Finished saving results to {}".format(filename))