"""
Automatic cell state annotator for tumor single-cell data.
"""
### MetaTiME annotator
import pandas as pd
import anndata
from metatime import config
import scanpy as sc
from typing import Optional, Union
from metatime import mecs
import warnings
########### top1 cell state annotator #######
def annotator(dat: "Union[pd.DataFrame, anndata.AnnData]",
              mecnamedict: dict,
              gcol='overcluster',
              ):
    """
    Annotator marking the top-1 enriched cell state of each cell group based on MeC scores.

    Parameters
    ----------
    dat
        dataframe with one row per cell, one score column per key of `mecnamedict`,
        and one column gcol (grouping column) e.g. leiden class
        or,
        anndata whose `to_df()` gives those score columns, with one column gcol in dat.obs
        Malignant cells shall be removed to keep tumor microenvironmental cells only,
        such as immune cells, fibroblasts, endothelial cells
    mecnamedict
        maps score column names to functional names; its keys select the score columns used.
        can be loaded from pre-computed tumor MeC functional annotation
    gcol
        for grouping cells. e.g. a column for overclustered cluster assignment.

    Returns
    ----------
    dat
        The input is never modified.
        For dataframe input: a new dataframe restricted to the score columns and gcol,
        with the predicted label column 'MetaTiME_'+gcol added.
        For anndata input: a copy of the anndata with 'MetaTiME_'+gcol stored
        as a categorical column in `.obs`.
    gpred: pd.DataFrame
        median score of each gcol group (groups in rows, score columns in columns)
    gpreddict: pd.Series
        label assigned to each gcol group, indexed by group

    Examples
    ----------
    >>> projmat, gpred, gpreddict = annotator( projmat , mecnamedict)
    """
    label_col = 'MetaTiME_' + gcol
    keep_cols = list(mecnamedict.keys()) + [gcol]

    # DataFrame is tested first, so table input never reaches the AnnData check.
    is_anndata = (not isinstance(dat, pd.DataFrame)) and isinstance(dat, anndata.AnnData)
    if is_anndata:
        pdata = dat.copy()
        scores = pdata.to_df().join(pdata.obs[[gcol]])
    else:
        scores = dat
    # Select, then copy: the caller's object stays untouched and the new label
    # column is not written onto a slice of another frame.
    projmat = scores[keep_cols].copy()

    # observed=True: a categorical gcol may carry categories with no cells left
    # (e.g. after filtering); such empty groups have no median and no label.
    grouped = projmat.groupby(gcol, observed=True)

    # median score for each group; gpred: cluster by mec.
    gpred = grouped.median()

    # top 1 score for each group, translated to its functional name
    gpreddict = gpred.idxmax(axis=1).map(mecnamedict)

    # A group keeps its label only if at least one score column has its 70%
    # quantile above 1 (i.e. the top 30% of cells exceed 1 for that column).
    # Otherwise this may be a noisy group, setting to 'Others'.
    pass_quantile = (grouped.quantile(q=0.7) > 1).any(axis=1)
    gpreddict[~pass_quantile] = 'Others'

    # generate top1 prediction per cell; a plain dict lookup is used so that
    # unused categories of gcol are simply ignored.
    projmat[label_col] = projmat[gcol].map(gpreddict.to_dict())

    if is_anndata:
        # replace any stale label column, then store the labels as categorical
        if label_col in pdata.obs.columns:
            del pdata.obs[label_col]
        pdata.obs[label_col] = projmat[label_col]
        pdata.obs[label_col] = pd.Categorical(pdata.obs[label_col])
        return pdata, gpred, gpreddict

    return projmat, gpred, gpreddict
def overcluster(adata: anndata.AnnData,
                resolution: float = 8,
                random_state: int = 0,
                clustercol: str = 'overcluster'):
    """
    Run leiden clustering at a high resolution so that annotation can be done per cluster.

    Parameters
    ----------
    adata
        scanpy object with adata.uns['neighbors'] computed.
        If adata.obsm['X_umap'] is missing, umap coordinates are computed;
        existing umap coordinates are left as they are.
    resolution
        clustering resolution passed to `sc.tl.leiden`
    random_state
        clustering random state passed to `sc.tl.leiden`
    clustercol
        key added to adata.obs that records the cluster assignment

    Returns
    ----------
    The same scanpy object, carrying the clustering results.
    """
    sc.tl.leiden(
        adata,
        resolution=resolution,
        key_added=clustercol,
        random_state=random_state,
    )

    # only compute an embedding when none is stored yet
    umap_present = 'X_umap' in adata.obsm.keys()
    if not umap_present:
        sc.tl.umap(adata)

    return adata
def pdataToTable(pdata: anndata.AnnData,
                 mectable: pd.DataFrame,
                 gcol: str = 'overcluster'):
    """
    Convert projected scores to two simple pandas dataframes

    Parameters
    ----------
    pdata
        anndata with cell by mec scores, and one column gcol in pdata.obs
    mectable
        for renaming score columns into functional names
        can be loaded from pre-computed tumor MeC functional annotation
        Documented required columns: `['MeC_id', 'Annotation', 'UseForCellTypeAnno']`.
        Only 'MeC_id' is read directly here; the others are presumably
        consumed by `mecs.getmecnamedict_ct`.
    gcol
        a column in pdata.obs for grouping cells. a column for overclustered cluster assignment.

    Returns
    ----------
    tuple
        projmat : a pandas dataframe with one column per `mectable['MeC_id']` plus
        the grouping column gcol. useful for annotating cell states.
        mecscores: a copy of projmat whose mec id columns are renamed through
        `mecs.getmecnamedict_ct(mectable)`; other columns keep their names.
    """
    scores = pdata.to_df()

    # DataFrame.join raises when both sides share a column name. Drop any obs
    # column that collides with a score column so the score matrix wins.
    colliding = pdata.obs.columns.intersection(scores.columns)
    obs = pdata.obs.drop(columns=colliding)

    # projmat: mec scores followed by the grouping column
    allmecids = mectable['MeC_id'].values.tolist()
    projmat = obs.join(scores)[allmecids + [gcol]]

    # mecscores: same table, columns renamed where a functional name exists
    mecctnamedict = mecs.getmecnamedict_ct(mectable)
    mecscores = projmat.copy()
    mecscores.columns = [mecctnamedict.get(col, col) for col in projmat.columns]

    return projmat, mecscores
def saveToAdata(adata: anndata.AnnData,
                projmat: pd.DataFrame,
                gcol: str = 'overcluster',
                ANNOTATION_ONLY: bool = False
                ):
    """
    Save annotation to adata. `adata.obs` is modified in place.

    Parameters
    ----------
    adata: anndata.AnnData
        scRNA scanpy object
    projmat: pd.DataFrame
        Dataframe with mec projection scores and newly added predicted label column 'MetaTiME_'+gcol
    gcol
        A column in projmat for grouping cells. Typically a column for overclustered cluster assignment.
    ANNOTATION_ONLY
        If True, only the annotation column 'MetaTiME_'+gcol is written to adata.obs.
        If False, every column of projmat is merged into adata.obs.

    Returns
    ----------
    anndata.AnnData
        adata with per-cluster annotation column adata.obs['MetaTiME_'+gcol],
        and all projmat columns appended to adata.obs if ANNOTATION_ONLY==False

    Warns
    ----------
    UserWarning
        when a column about to be written already exists in adata.obs; the
        existing column is replaced.
    """
    label_col = 'MetaTiME_' + gcol

    # Only the columns that will actually be written may replace existing
    # ones. In annotation-only mode that is just the label column, so other
    # shared columns (e.g. gcol itself) stay in adata.obs.
    incoming_cols = projmat[[label_col]].columns if ANNOTATION_ONLY else projmat.columns
    exist_cols = incoming_cols.intersection(adata.obs.columns)
    if len(exist_cols) > 0:
        warnings.warn('Columns in projmat already overlap columns in adata.obs. Overwriting adata.obs. '
                      )
        adata.obs = adata.obs[[t for t in adata.obs.columns if t not in exist_cols]]

    if ANNOTATION_ONLY:
        adata.obs[label_col] = projmat[label_col]
    else:
        # left merge on the index keeps every cell of adata, in its order
        adata.obs = adata.obs.merge(projmat, how='left', left_index=True, right_index=True)

    return adata
def saveToPdata(pdata: anndata.AnnData,
                adata: anndata.AnnData,
                projmat: pd.DataFrame,
                gcol: str = 'overcluster',
                BORROW_ADATA_EMBEDDING=True,
                ):
    """
    Save annotation to pdata. `pdata` is modified in place.
    Borrow embedding from adata for easy visualization, including adata.obsm['X_pca'], adata.obsm['X_umap'], adata.obsm['X_pca_harmony']

    Parameters
    ----------
    pdata : anndata.AnnData
        scanpy object for per-cell projected score.
    adata: anndata.AnnData
        scRNA scanpy object
    projmat: pd.DataFrame
        Dataframe with mec projection scores, the grouping column gcol,
        and the predicted label column 'MetaTiME_'+gcol
    gcol
        A column in projmat for grouping cells. Typically a column for overclustered cluster assignment.
    BORROW_ADATA_EMBEDDING
        Whether to borrow pca and umap embeddings from adata to write in pdata. For easy visualization.
        Embeddings that adata does not have are skipped; a warning is issued
        when 'X_pca' or 'X_umap' is missing.

    Returns
    ----------
    anndata.AnnData
        pdata with the grouping column pdata.obs[gcol] and the per-cluster
        annotation column pdata.obs['MetaTiME_'+gcol].
    """
    if BORROW_ADATA_EMBEDDING:
        # Borrowing is a visualization convenience: copy what exists and
        # still save the annotation when an embedding is missing.
        for key in ('X_pca', 'X_umap', 'X_pca_harmony'):
            if key in adata.obsm:
                pdata.obsm[key] = adata.obsm[key]
            elif key != 'X_pca_harmony':
                # harmony is optional; pca/umap are normally expected
                warnings.warn("adata.obsm has no '" + key + "'; embedding not copied to pdata.")

    # assign by column name so each value is a Series aligned on the cell index
    pdata.obs[gcol] = projmat[gcol]
    pdata.obs['MetaTiME_' + gcol] = projmat['MetaTiME_' + gcol]
    return pdata