# Source code for metatime.mecmapper

import pandas as pd
import os
import scanpy as sc
import numpy as np
import warnings
import anndata



############ project single cell to mec for creating new adata obj ###########

def projectMec(expr, mec, glstcol = 'TopGene20'):

    """ 
    Project a cell-by-gene expression matrix onto MeCs (signatures).

    Parameters
    ----------
    expr
        expression matrix, cell by gene (genes are the columns).
    mec
        mec table, two types of format accepted.
        format 1, each row is a mec and column `glstcol` holds its genes as a
        comma separated string. The score of a cell is the unweighted mean
        expression of the listed genes that are present in `expr`.
        format 2, each row is a gene, each column is a mec, value is float.
        The score of a cell is the weighted sum over the genes shared
        between `mec` and `expr`.
        The format is detected from the first cell of `mec`: a string there
        means format 1, anything else means format 2.
    glstcol
        Used only when mec is the list format format 1, and glstcol is the column name to record comma separated gene list for each mec.

    Returns
    ----------
    scorepd, cell by signature.
        format 1: columns are the stringified row labels of `mec`; a mec
        sharing no gene with `expr` scores 0 for every cell, and any missing
        score is filled with 0.
        format 2: columns are the columns of `mec`.

    Examples
    ---------
    >>> scorepd = mecmapper.projectMec( df, mec )
    
    """
    if isinstance(mec.iloc[0, 0], str):
        # the mec format is format 1. no weighting.
        score_lst = []
        for mec_id in mec.index:
            # Strip blanks so a list written as "A, B" still matches gene "B";
            # a set keeps the per-column membership test O(1).
            wanted = {g.strip() for g in mec.loc[mec_id, glstcol].split(',')}
            present = [g for g in expr.columns if g in wanted]
            if present:
                score = expr[present].mean(axis=1)
            else:
                # No listed gene is measured: score the mec as 0 explicitly
                # instead of relying on the mean of an empty selection.
                score = pd.Series(0.0, index=expr.index)
            score_lst.append(score.rename(str(mec_id)))
        return pd.concat(score_lst, axis=1).fillna(0)

    # the mec format is not format 1. using values as weight.
    shared_genes = mec.index.intersection(expr.columns)
    weights = mec.loc[shared_genes]
    scorearray = expr[shared_genes].values.dot(weights.values)
    return pd.DataFrame(scorearray, index=expr.index, columns=mec.columns)



def annToDataFrame( adata_input, genescaling = False,  layer = 'norm_data'):
    """ 
    Extract expression matrix from scanpy object.

    Parameters
    ----------
    adata_input: anndata.AnnData
        Input scanpy object. It is copied before use.
    genescaling: bool
        Whether to z-scale extracted feature matrix (per column: subtract the
        mean, divide by the standard deviation). A column with zero standard
        deviation does not get a finite z-score.
    layer: str
        Layer of expression to extract from adata_input. If the layer is
        missing, a warning is emitted and X is used instead.

    Returns
    ----------
    pd.dataframe
        Extracted expression matrix from scanpy object.
        
    """
    adata = adata_input.copy()
    try:
        df = adata.to_df(layer = layer)
    except KeyError:
        # Only a missing layer triggers the fallback; any other failure is a
        # real error and must not be reported as "no layer".
        warnings.warn('Warning: no '+layer +' layer. using X.')
        df = adata.to_df()
    if genescaling:
        df = (df - df.mean())/df.std()

    return df


def scale(df):
    """
    Standardize features: subtract the mean and divide by the standard deviation.

    Parameters
    ----------
    df
        Table of values; `df.mean()` and `df.std()` are applied as provided
        by the object (per column for a pandas DataFrame).

    Returns
    ----------
    Same type as the arithmetic on `df` yields
        The centered and scaled values. For pandas input, a column whose
        standard deviation is zero comes out as NaN.
    """
    centered = df - df.mean()
    return centered / df.std()

def projectMecAnn( adata_input, 
                mec, 
                genescaling = False,  
                sigscaling = True, 
                addon = False, 
                layer = 'norm_data', 
                glstcol = 'TopGene20'):
    """
    Project single cell expression in AnnData to MeCs
    Calls: projectMec, annToDataFrame, scale

    Parameters
    ----------
    adata_input: anndata.AnnData
        Input scanpy object for gene expression. It is not modified.
    mec: pd.DataFrame
        mec table, two types of format both accepted
        format 1, each row is a mec with genes , comma separated.
        format 2, each row is a genes, each column is a mec, value is float.
    genescaling: bool
        Whether to scale expression on gene level. Recommended to be False.
    sigscaling: bool
        Whether to scale projected scores across cells. Recommended and default is True.
    addon: bool
        Whether to keep the original adata and append the signatures in obs, or return an independent anndata (pdata) with only projected values (which saves memory).
    layer: str
        Layer of expression to extract from adata_input
    glstcol
        Used only when mec is the list format format 1, and glstcol is the column name to record comma separated gene list for each mec.


    Returns
    ----------
    anndata.AnnData
        If addon is False, return pdata where values are MeC-projected values. 
        If addon is True, return adata where values are same as in adata_input, but with extra obs columns.
        If addon is True and a signature name already exists in obs, a
        warning is emitted and the copy is returned without the scores.
    Examples
    ----------
    >>> pdata = mecmapper.projectMecAnn(adata, mec_score_topg, sigscaling=True, genescaling=False, addon=False)
    """

    # The expression matrix is only read here, so no copy of the whole
    # AnnData is needed before extracting it.
    df = annToDataFrame(adata_input, genescaling=genescaling, layer=layer)

    projected = projectMec(df, mec, glstcol=glstcol)
    projected.columns = [str(t) for t in projected.columns]
    if sigscaling:
        projected = scale(projected)

    if not addon:
        # Only obs is carried over, so copy just that table.
        return anndata.AnnData(projected, obs=adata_input.obs.copy())

    # One private copy is enough: it is what gets extended and returned.
    adata = adata_input.copy()
    overlap = projected.columns.intersection(adata.obs.columns)
    if len(overlap) > 0:
        warnings.warn(
            'signature names already in adata.obs. cannot merge. '
            'Conflicting columns: ' + ', '.join(str(t) for t in overlap) + '. '
            'If wish to remove in obs: \n'
            '    adata.obs = adata.obs[[t for t in adata.obs.columns if t not in signature_names]] \n'
            'or \n'
            '    adata.obs = adata.obs[[t for t in adata.obs.columns if t[:6]!="score_" ]]'
        )
    else:
        adata.obs = adata.obs.merge(projected, how='left', left_index=True, right_index=True)
    return adata


def projectModuleAnn_aucell( adata, module, glstcol = 'TopGene20'):
    """ 
    Alternative function that projects scRNA data using top genes from MeCs.
    module has to be list mode.

    NOTE: despite the function name, the score is computed with
    `sc.tl.score_genes`; no AUCell implementation is called here.

    Parameters
    ----------
    adata: anndata.AnnData
        Input scanpy object for gene expression. It is modified in place:
        one obs column is added per mec.
    module: pd.DataFrame
        mec table in list format: each row is a mec with genes , comma separated.
    glstcol
        Column name holding the comma separated gene list for each mec.
        If falsy (None or ''), the first column of `module` is used.

    Returns
    ----------
    anndata.AnnData
        The same adata object, with scores stored in extra columns starting with 'score_sig_' in adata.obs.
        A mec with fewer than two genes found in adata.var_names gets a constant 0.

    
    """

    if not glstcol:
        glstcol = module.columns[0]
    for i in module.index:
        score_name = 'score_sig_' + str(i)
        # Strip blanks so a list written as "A, B" still matches gene "B".
        gene_list = [g.strip() for g in module.loc[i, glstcol].split(',')]
        use_gene_list = adata.var_names.intersection(gene_list)
        if len(use_gene_list) > 1:
            sc.tl.score_genes(adata, use_gene_list, score_name=score_name)
        else:
            adata.obs[score_name] = 0
    return adata

"""
### TODO. 
# Tutorial. mapper+annotator.
# Tutorial. differential signature.
# Tutorial functional extraction.
# metatime calling. (code to be wrapped in pipeline with a tutorial, that's it. )
# A class for projected data. this is per-cell class.
#init: score matrix, .obs features, cell location (adata.umap). function and category for features (mecs class.).
#fun:  plot a single signature using mec name. plot all signatures. plot all signature in
# 
#THEN, a separate annotator function. takes projected class. takes adata input ( overclustering. )
# THEN, a separate differential signature comparison. 
# mecs class: print top gene, plot top gene, extract top tf, extract enrichr, extract lisa ranking, extract lisa-mec ranking plot. 
"""