Module `cellex.utils.parse_input`

Expand source code

import numpy as np
import pandas as pd
import datetime
import time

def parse_input(data: pd.DataFrame, annotation, verbose: bool=False) -> tuple:
    """Parse input data and annotation

    Checks that data and annotation / metadata match.
    Checks for duplicates.
    Appends _dp{n} to duplicated cell ids.

    Parameters
    ----------
    data : DataFrame
        Original expression data; one column per cell. If duplicated
        column labels are found, the columns of this object are renamed
        in place (the same object is returned).

    annotation : ndarray, Series, DataFrame
        Annotation to group cells by. An ndarray is taken to be in the
        same order as the columns of ``data``. A Series (or a
        single-column DataFrame) is matched to the columns of ``data``
        by its index labels, so it may be in any order. The object
        passed in is not modified.

    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    data, annotation : (DataFrame, ndarray)
        Data and annotation with checked index/column headers. The
        annotation is returned as an array of str ordered like the
        columns of ``data``.

    Raises
    ------
    AssertionError
        If the number of annotations differs from the number of columns
        in ``data``, if a DataFrame annotation does not have exactly one
        column, or if the annotation index labels do not match the
        column labels of ``data`` 1:1.

    """
    from collections import Counter

    start = 0

    if verbose:
        start = time.time()
        print("Preprocessing - checking input ... ", end='')

    # Check input length
    assert (len(annotation) == data.shape[1]), "Number of annotations do not match number of cells."

    # Turn annotation into Series
    # dataframe --> series
    if isinstance(annotation, pd.DataFrame):
        assert (annotation.shape[1] == 1), "DataFrame annotation had unexpected number of columns {}".format(annotation.shape[1])
        annotation = annotation.iloc[:, 0]

    # numpy array --> series (positional: labels are taken from data)
    if isinstance(annotation, np.ndarray):
        annotation = pd.Series(data=annotation, index=data.columns.values)

    # Labels must match as multisets. Done after the conversions above so
    # that DataFrame input is validated too; a label mismatch would
    # otherwise surface only as 'nan' annotations in the final mapping.
    if isinstance(annotation, pd.Series):
        assert Counter(annotation.index) == Counter(data.columns), "Data columns and annotation index values do not match 1:1."

    # Handle duplicates
    if not data.columns.is_unique:

        if verbose:
            print("\n  duplicate cell id's detected ... ", end='')

        cols = pd.Series(data.columns)
        dups = cols[cols.duplicated()].unique()
        dup_ids = set(dups)
        # number of columns that carry a duplicated id
        n_dups = int(cols.duplicated(keep=False).sum())

        # check for differing type-annotations (reporting only)
        if verbose and isinstance(annotation, pd.Series):
            for d in dups:
                types = annotation.values[annotation.index == d]
                if len(np.unique(types)) > 1:
                    print("\n  duplicated id {uid} has >1 type-annotation: {t}".format(uid=d, t=types), end='')

        def _with_suffixes(labels):
            # Append _dp{0} .. _dp{n} to each duplicated id, numbered by
            # order of occurrence within `labels`; other ids are kept.
            seen = Counter()
            renamed = []
            for label in labels:
                if label in dup_ids:
                    renamed.append("{}_dp{}".format(label, seen[label]))
                    seen[label] += 1
                else:
                    renamed.append(label)
            return renamed

        # Assign fresh labels instead of writing into Index.values, which
        # bypasses Index immutability and leaves cached lookups stale.
        data.columns = pd.Index(_with_suffixes(data.columns), name=data.columns.name)

        # Rename the annotation by its own occurrence order, so the k-th
        # occurrence of an id in the annotation pairs with the k-th
        # occurrence in data even when the two are ordered differently.
        if isinstance(annotation, pd.Series):
            annotation = pd.Series(
                annotation.values,
                index=pd.Index(_with_suffixes(annotation.index), name=annotation.index.name),
                name=annotation.name,
            )

        if verbose:
            print("\n  {n} duplicate id's renamed ... ".format(n=n_dups), end='')

    # Reduce annotation to ndarray ordered like the data columns
    # series --> numpy array
    if isinstance(annotation, pd.Series):
        annotation = data.columns.map(annotation, na_action="ignore").values.astype(str)

    if verbose:
        td = datetime.timedelta(seconds=(time.time() - start))
        print("input parsed in %d min %d sec" % (divmod(td.seconds, 60)))

    return data, annotation

Functions

def parse_input(data: pandas.core.frame.DataFrame, annotation, verbose: bool = False) -> NoneType

Parse input data and annotation

Checks that data and annotation / metadata match. Checks for duplicates. Appends _dp{n} to duplicated cell IDs.

Parameters

data : DataFrame: Original expression data.
annotation : ndarray, Series, DataFrame: Annotation to group cells by.
verbose : bool, optional (default: False): Print progress report.

Returns

data, annotation : (DataFrame, ndarray): Data and annotation with checked index/column headers

Expand source code

def parse_input(data: pd.DataFrame, annotation, verbose: bool=False) -> tuple:
    """Parse input data and annotation

    Checks that data and annotation / metadata match.
    Checks for duplicates.
    Appends _dp{n} to duplicated cell ids.

    Parameters
    ----------
    data : DataFrame
        Original expression data; one column per cell. If duplicated
        column labels are found, the columns of this object are renamed
        in place (the same object is returned).

    annotation : ndarray, Series, DataFrame
        Annotation to group cells by. An ndarray is taken to be in the
        same order as the columns of ``data``. A Series (or a
        single-column DataFrame) is matched to the columns of ``data``
        by its index labels, so it may be in any order. The object
        passed in is not modified.

    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    data, annotation : (DataFrame, ndarray)
        Data and annotation with checked index/column headers. The
        annotation is returned as an array of str ordered like the
        columns of ``data``.

    Raises
    ------
    AssertionError
        If the number of annotations differs from the number of columns
        in ``data``, if a DataFrame annotation does not have exactly one
        column, or if the annotation index labels do not match the
        column labels of ``data`` 1:1.

    """
    from collections import Counter

    start = 0

    if verbose:
        start = time.time()
        print("Preprocessing - checking input ... ", end='')

    # Check input length
    assert (len(annotation) == data.shape[1]), "Number of annotations do not match number of cells."

    # Turn annotation into Series
    # dataframe --> series
    if isinstance(annotation, pd.DataFrame):
        assert (annotation.shape[1] == 1), "DataFrame annotation had unexpected number of columns {}".format(annotation.shape[1])
        annotation = annotation.iloc[:, 0]

    # numpy array --> series (positional: labels are taken from data)
    if isinstance(annotation, np.ndarray):
        annotation = pd.Series(data=annotation, index=data.columns.values)

    # Labels must match as multisets. Done after the conversions above so
    # that DataFrame input is validated too; a label mismatch would
    # otherwise surface only as 'nan' annotations in the final mapping.
    if isinstance(annotation, pd.Series):
        assert Counter(annotation.index) == Counter(data.columns), "Data columns and annotation index values do not match 1:1."

    # Handle duplicates
    if not data.columns.is_unique:

        if verbose:
            print("\n  duplicate cell id's detected ... ", end='')

        cols = pd.Series(data.columns)
        dups = cols[cols.duplicated()].unique()
        dup_ids = set(dups)
        # number of columns that carry a duplicated id
        n_dups = int(cols.duplicated(keep=False).sum())

        # check for differing type-annotations (reporting only)
        if verbose and isinstance(annotation, pd.Series):
            for d in dups:
                types = annotation.values[annotation.index == d]
                if len(np.unique(types)) > 1:
                    print("\n  duplicated id {uid} has >1 type-annotation: {t}".format(uid=d, t=types), end='')

        def _with_suffixes(labels):
            # Append _dp{0} .. _dp{n} to each duplicated id, numbered by
            # order of occurrence within `labels`; other ids are kept.
            seen = Counter()
            renamed = []
            for label in labels:
                if label in dup_ids:
                    renamed.append("{}_dp{}".format(label, seen[label]))
                    seen[label] += 1
                else:
                    renamed.append(label)
            return renamed

        # Assign fresh labels instead of writing into Index.values, which
        # bypasses Index immutability and leaves cached lookups stale.
        data.columns = pd.Index(_with_suffixes(data.columns), name=data.columns.name)

        # Rename the annotation by its own occurrence order, so the k-th
        # occurrence of an id in the annotation pairs with the k-th
        # occurrence in data even when the two are ordered differently.
        if isinstance(annotation, pd.Series):
            annotation = pd.Series(
                annotation.values,
                index=pd.Index(_with_suffixes(annotation.index), name=annotation.index.name),
                name=annotation.name,
            )

        if verbose:
            print("\n  {n} duplicate id's renamed ... ".format(n=n_dups), end='')

    # Reduce annotation to ndarray ordered like the data columns
    # series --> numpy array
    if isinstance(annotation, pd.Series):
        annotation = data.columns.map(annotation, na_action="ignore").values.astype(str)

    if verbose:
        td = datetime.timedelta(seconds=(time.time() - start))
        print("input parsed in %d min %d sec" % (divmod(td.seconds, 60)))

    return data, annotation