Module cellex.preprocessing.log_normalize

Expand source code
import numpy as np
import pandas as pd
import time
import datetime

def log_normalize(df: pd.DataFrame, constant: int=1, scale: int=1e4, verbose: bool=False):
    """
    Normalizes data by scaling and log-transformation
    Default scale is 10,000
    
    Parameters
    ----------
    df : DataFrame
        Raw expression values, e.g. UMI counts.
    
    constant : int, optional (default: 1)
        Constant added before taking log.
    
    scale : float, optional (default: 1e4)
        Scale factor for common transcript count.
    
    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    df_lognorm : DataFrame
        Log normalized expression values

    NOTE:
    * df format – rows: genes, cols: cells.
              A     B     C
        POMC  0     2     1
        AGRP  3     0     0
        LEPR  1     1     4
    
    TODO:
    * verbose printing
    """
    start = 0

    if verbose:
        start = time.time()
        print("Preprocessing - normalizing data ... ", end='')

    # divide df by column sums to normalize values per cell
    # scale df values by scale (default: 1e4)
    # add 1 to all values in df to avoid div/0 errors in log
    # take log of values in df
    
    df_lognorm = np.log((df / df.sum(axis=0) * scale) + constant)
    
    if verbose:
        td = datetime.timedelta(seconds=(time.time() - start))
        print("data normalized in %d min %d sec" % (divmod(td.seconds, 60)))

    return df_lognorm

Functions

def log_normalize(df: pandas.core.frame.DataFrame, constant: int = 1, scale: int = 10000.0, verbose: bool = False)

Normalizes data by scaling and log-transformation Default scale is 10,000

Parameters

df : DataFrame
Raw expression values, e.g. UMI counts.
constant : int, optional (default: 1)
Constant added before taking log.
scale : float, optional (default: 1e4)
Scale factor for common transcript count.
verbose : bool, optional (default: False)
Print progress report.

Returns

df_lognorm : DataFrame
Log normalized expression values
NOTE:
 
  • df format – rows: genes, cols: cells. A B C POMC 0 2 1 AGRP 3 0 0 LEPR 1 1 4
TODO:
 
  • verbose printing
Expand source code
def log_normalize(df: pd.DataFrame, constant: int=1, scale: int=1e4, verbose: bool=False):
    """
    Normalizes data by scaling and log-transformation
    Default scale is 10,000
    
    Parameters
    ----------
    df : DataFrame
        Raw expression values, e.g. UMI counts.
    
    constant : int, optional (default: 1)
        Constant added before taking log.
    
    scale : float, optional (default: 1e4)
        Scale factor for common transcript count.
    
    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    df_lognorm : DataFrame
        Log normalized expression values

    NOTE:
    * df format – rows: genes, cols: cells.
              A     B     C
        POMC  0     2     1
        AGRP  3     0     0
        LEPR  1     1     4
    
    TODO:
    * verbose printing
    """
    start = 0

    if verbose:
        start = time.time()
        print("Preprocessing - normalizing data ... ", end='')

    # divide df by column sums to normalize values per cell
    # scale df values by scale (default: 1e4)
    # add 1 to all values in df to avoid div/0 errors in log
    # take log of values in df
    
    df_lognorm = np.log((df / df.sum(axis=0) * scale) + constant)
    
    if verbose:
        td = datetime.timedelta(seconds=(time.time() - start))
        print("data normalized in %d min %d sec" % (divmod(td.seconds, 60)))

    return df_lognorm