Module cellex.preprocessing.remove_non_expressed

Expand source code
import numpy as np
import pandas as pd
import time
import datetime

def remove_non_expressed(df: pd.DataFrame, verbose: bool=False) -> pd.DataFrame:
    """
    Identifies and removes non-expressed genes from the dataframe,
    i.e. genes (rows) whose expression sums to 0 across all cells (columns).

    A row has mean 0 exactly when its sum is 0, so the row sum is tested
    directly instead of computing the mean.

    Parameters
    ----------
    df : DataFrame
        Gene expression data, e.g. UMI counts.
        Format – rows: genes, cols: cells.
                  A     B     C
            POMC  0.0   0.5   0.9
            AGRP  0.2   0.0   0.0
            LEPR  0.1   0.1   0.4

    verbose : bool, optional (default: False)
        Print progress report (number of excluded genes and elapsed time).

    Returns
    -------
    df_filtered : DataFrame
        The rows of ``df`` whose sum across all columns is non-zero.
        Row order and columns are unchanged.
    """
    if verbose:
        start = time.perf_counter()
        # flush so the message is visible while the filtering runs,
        # not only once the terminating newline is printed
        print("Preprocessing - running remove_non_expressed ... ", end='', flush=True)

    # Compare the row sum to zero rather than sum / n_cells: the division is
    # redundant, can underflow a tiny non-zero sum to 0.0, and yields NaN
    # (0 / 0) when df has no columns.
    expressed = df.sum(axis=1) != 0

    df_filtered = df[expressed]

    if verbose:
        n_genes_org = len(df)
        n_genes_excluded = n_genes_org - len(df_filtered)
        # whole elapsed seconds split into minutes and seconds
        minutes, seconds = divmod(int(time.perf_counter() - start), 60)
        print("excluded %d / %d genes in %d min %d sec" % (n_genes_excluded, n_genes_org, minutes, seconds))

    return df_filtered

Functions

def remove_non_expressed(df: pandas.core.frame.DataFrame, verbose: bool = False)

Identifies and removes genes with mean 0 from the dataframe, i.e. non-expressed genes.

Parameters

df : DataFrame
Gene expression data, e.g. UMI counts.
verbose : bool, optional (default: False)
Print progress report.

Returns

df_filtered : DataFrame
Filtered data with no genes with mean 0 expression across all cells.
NOTE:
 
  • df format – rows: genes, cols: cells. Example:
              A     B     C
        POMC  0.0   0.5   0.9
        AGRP  0.2   0.0   0.0
        LEPR  0.1   0.1   0.4
TODO:
 
  • consider a faster way of determining this. Perhaps just check if sum > 0?
  • verbose printing
Expand source code
def remove_non_expressed(df: pd.DataFrame, verbose: bool=False) -> pd.DataFrame:
    """
    Identifies and removes non-expressed genes from the dataframe,
    i.e. genes (rows) whose expression sums to 0 across all cells (columns).

    A row has mean 0 exactly when its sum is 0, so the row sum is tested
    directly instead of computing the mean.

    Parameters
    ----------
    df : DataFrame
        Gene expression data, e.g. UMI counts.
        Format – rows: genes, cols: cells.
                  A     B     C
            POMC  0.0   0.5   0.9
            AGRP  0.2   0.0   0.0
            LEPR  0.1   0.1   0.4

    verbose : bool, optional (default: False)
        Print progress report (number of excluded genes and elapsed time).

    Returns
    -------
    df_filtered : DataFrame
        The rows of ``df`` whose sum across all columns is non-zero.
        Row order and columns are unchanged.
    """
    if verbose:
        start = time.perf_counter()
        # flush so the message is visible while the filtering runs,
        # not only once the terminating newline is printed
        print("Preprocessing - running remove_non_expressed ... ", end='', flush=True)

    # Compare the row sum to zero rather than sum / n_cells: the division is
    # redundant, can underflow a tiny non-zero sum to 0.0, and yields NaN
    # (0 / 0) when df has no columns.
    expressed = df.sum(axis=1) != 0

    df_filtered = df[expressed]

    if verbose:
        n_genes_org = len(df)
        n_genes_excluded = n_genes_org - len(df_filtered)
        # whole elapsed seconds split into minutes and seconds
        minutes, seconds = divmod(int(time.perf_counter() - start), 60)
        print("excluded %d / %d genes in %d min %d sec" % (n_genes_excluded, n_genes_org, minutes, seconds))

    return df_filtered