Module cellex.plotting.n_es_genes

Expand source code
import numpy as np
import pandas as pd
import plotnine as p9

def n_es_genes(df: pd.DataFrame, 
                annotation: pd.Series,
                figsize: tuple=None) -> p9.ggplot:
    """Plot distribution of number of ES genes per group
    
    Counts, for every column of ``df`` (e.g. a cell(-type)), how many
    rows hold a non-zero, non-missing value, relabels each column with
    its group from ``annotation`` and draws one violin (with jittered
    points) per group. A dashed blue line marks the mean count over
    all columns.
    
    Parameters
    ----------
    df : DataFrame
        Dataframe containing positive ES weights, ideally use only ESmu.
        Missing values (NaN) are not counted as ES genes.
    annotation : Series
        Annotation to group dataframe cell(-types) by in the violin plots.
        Its index is matched against the column labels of ``df``; for
        duplicated index entries only the first is used. If a DataFrame
        (or subclass) is passed, its first column is used.
        Columns of ``df`` without a matching annotation are grouped
        under the string label "nan".
    figsize : (float, float), optional (default: None)
        Specify width and height of plot. If None, the width is the
        number of groups capped at 10 and the height is 6.4.
    
    Returns
    -------
    p : ggplot
        A plotnine ggplot

    """
    
    ### Count number of non-zero values per column, i.e. ESw > 0.
    # NaN casts to True under astype(bool), so missing weights are
    # masked out explicitly to avoid counting them as ES genes.
    is_es_gene = df.astype(bool) & df.notna()
    counts = is_es_gene.sum(axis=0)
    
    ### Map column labels to annotation
    # isinstance (rather than an exact type check) also accepts
    # DataFrame subclasses.
    if isinstance(annotation, pd.DataFrame):
        annotation = annotation.iloc[:, 0]
    
    # remove duplicates so every column label maps to exactly one group
    annotation = annotation.loc[~annotation.index.duplicated(keep="first")]

    # Labels missing from the annotation map to NaN and become "nan" here.
    # Assigning a plain array also leaves the index unnamed, so
    # reset_index() below yields a column literally called "index".
    counts.index = counts.index.map(annotation, na_action="ignore").values.astype(str)
    
    # Constants, height and width of plot.
    if figsize is None:
        W = min((counts.index.nunique(), 10))
        H = 6.4 # plotnine default height
    else:
        W, H = figsize

    ### Convert to tidy / long format
    # counts (one entry per column of the input, labelled by group):
    #   ABC    2
    #   ABC    1
    #   ACBG   3
    #
    # Tidy:
    #     index  annotation  count
    #   0 ABC    0           2
    #   1 ABC    0           1
    #   2 ACBG   0           3
    #
    # "index" holds the group label; "annotation" only holds the name of
    # the melted value column (0 for an unnamed Series) and is not used
    # by the plot.
    df_tidy = pd.melt(
        counts.reset_index(),
        id_vars="index",
        var_name="annotation",
        value_name="count",
    )
    
    ### Compute the mean count of ES genes
    mean_count = df_tidy["count"].mean()
    
    ### Plot
    p = (
        ### data
        p9.ggplot(data=df_tidy, 
                  mapping=p9.aes(x="index", y="count", fill="index", label="index"), 
                 )

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size = (W,H),
            axis_text_x = p9.element_text(rotation=75)
        )

        + p9.labs(
            x="", # e.g. "Cell-type"
            y="Number of ES genes", # e.g. "ES weight"
        )
        
        ### viz
        + p9.geom_violin(scale="width", show_legend=False)
        + p9.geom_jitter(width=0.1, height=0, show_legend=False)
        + p9.geom_hline(yintercept=mean_count, color="blue", linetype="dashed", show_legend=False)
    )
    
    return p

Functions

def n_es_genes(df: pandas.core.frame.DataFrame, annotation: pandas.core.series.Series, figsize: tuple = None) -> plotnine.ggplot.ggplot

Plot distribution of number of ES genes per group

Computes the number of ES genes per column, e.g. cell(-type) and plots the distribution for the groups specified by the annotation.

Parameters

df : DataFrame
Dataframe containing positive ES weights, ideally use only ESmu.
annotation : Series
Annotation used to group the dataframe's columns (cell(-types)) in the violin plots.
figsize : (float, float), optional (default: None)
Specify width and height of plot.

Returns

p : ggplot
A plotnine ggplot
Expand source code
def n_es_genes(df: pd.DataFrame, 
                annotation: pd.Series,
                figsize: tuple=None) -> p9.ggplot:
    """Plot distribution of number of ES genes per group
    
    Counts, for every column of ``df`` (e.g. a cell(-type)), how many
    rows hold a non-zero, non-missing value, relabels each column with
    its group from ``annotation`` and draws one violin (with jittered
    points) per group. A dashed blue line marks the mean count over
    all columns.
    
    Parameters
    ----------
    df : DataFrame
        Dataframe containing positive ES weights, ideally use only ESmu.
        Missing values (NaN) are not counted as ES genes.
    annotation : Series
        Annotation to group dataframe cell(-types) by in the violin plots.
        Its index is matched against the column labels of ``df``; for
        duplicated index entries only the first is used. If a DataFrame
        (or subclass) is passed, its first column is used.
        Columns of ``df`` without a matching annotation are grouped
        under the string label "nan".
    figsize : (float, float), optional (default: None)
        Specify width and height of plot. If None, the width is the
        number of groups capped at 10 and the height is 6.4.
    
    Returns
    -------
    p : ggplot
        A plotnine ggplot

    """
    
    ### Count number of non-zero values per column, i.e. ESw > 0.
    # NaN casts to True under astype(bool), so missing weights are
    # masked out explicitly to avoid counting them as ES genes.
    is_es_gene = df.astype(bool) & df.notna()
    counts = is_es_gene.sum(axis=0)
    
    ### Map column labels to annotation
    # isinstance (rather than an exact type check) also accepts
    # DataFrame subclasses.
    if isinstance(annotation, pd.DataFrame):
        annotation = annotation.iloc[:, 0]
    
    # remove duplicates so every column label maps to exactly one group
    annotation = annotation.loc[~annotation.index.duplicated(keep="first")]

    # Labels missing from the annotation map to NaN and become "nan" here.
    # Assigning a plain array also leaves the index unnamed, so
    # reset_index() below yields a column literally called "index".
    counts.index = counts.index.map(annotation, na_action="ignore").values.astype(str)
    
    # Constants, height and width of plot.
    if figsize is None:
        W = min((counts.index.nunique(), 10))
        H = 6.4 # plotnine default height
    else:
        W, H = figsize

    ### Convert to tidy / long format
    # counts (one entry per column of the input, labelled by group):
    #   ABC    2
    #   ABC    1
    #   ACBG   3
    #
    # Tidy:
    #     index  annotation  count
    #   0 ABC    0           2
    #   1 ABC    0           1
    #   2 ACBG   0           3
    #
    # "index" holds the group label; "annotation" only holds the name of
    # the melted value column (0 for an unnamed Series) and is not used
    # by the plot.
    df_tidy = pd.melt(
        counts.reset_index(),
        id_vars="index",
        var_name="annotation",
        value_name="count",
    )
    
    ### Compute the mean count of ES genes
    mean_count = df_tidy["count"].mean()
    
    ### Plot
    p = (
        ### data
        p9.ggplot(data=df_tidy, 
                  mapping=p9.aes(x="index", y="count", fill="index", label="index"), 
                 )

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size = (W,H),
            axis_text_x = p9.element_text(rotation=75)
        )

        + p9.labs(
            x="", # e.g. "Cell-type"
            y="Number of ES genes", # e.g. "ES weight"
        )
        
        ### viz
        + p9.geom_violin(scale="width", show_legend=False)
        + p9.geom_jitter(width=0.1, height=0, show_legend=False)
        + p9.geom_hline(yintercept=mean_count, color="blue", linetype="dashed", show_legend=False)
    )
    
    return p