Source code for Garfield.preprocessing.preprocess_utils

import os
import io
import pkgutil

import scanpy as sc
import anndata as ad

from anndata import AnnData
from muon import MuData

import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import issparse, csr
from sklearn.preprocessing import MaxAbsScaler

# from ._pca import select_pcs_features
from ._utils import reindex, gene_scores, TFIDF_LSI, clr_normalize_each_cell, determine_species
from .adj_construction import create_adj, graph_construction

CHUNK_SIZE = 20000


def batch_scale(adata, use_rep="X", batch_key="batch", chunk_size=CHUNK_SIZE):
    """
    Batch-specific scale data

    Parameters
    ----------
    adata
        AnnData
    use_rep
        use '.X' or '.obsm'
    chunk_size
        chunk large data into small chunks

    """
    for b in adata.obs[batch_key].unique():
        idx = np.where(adata.obs[batch_key] == b)[0]
        if use_rep == "X":
            scaler = MaxAbsScaler(copy=False).fit(adata.X[idx])
            for i in range(len(idx) // chunk_size + 1):
                adata.X[idx[i * chunk_size : (i + 1) * chunk_size]] = scaler.transform(
                    adata.X[idx[i * chunk_size : (i + 1) * chunk_size]]
                )
        else:
            scaler = MaxAbsScaler(copy=False).fit(adata.obsm[use_rep][idx])
            for i in range(len(idx) // chunk_size + 1):
                adata.obsm[use_rep][
                    idx[i * chunk_size : (i + 1) * chunk_size]
                ] = scaler.transform(
                    adata.obsm[use_rep][idx[i * chunk_size : (i + 1) * chunk_size]]
                )

    return adata


[docs] def preprocessing_rna( adata: AnnData, min_features: int = 600, min_cells: int = 3, target_sum: int = 10000, used_hvgs: bool = True, used_pca_graph: bool = True, rna_n_top_features=2000, # or gene list n_components: int = 50, n: int = 15, batch_key: str = "batch", metric: str = "euclidean", svd_solver: str = "arpack", keep_mt: bool = False, backed: bool = False, ): """ Preprocessing single-cell RNA-seq data Parameters ---------- adata An AnnData matrice of shape n_obs x n_vars. Rows correspond to cells and columns to genes. min_features Filtered out cells that are detected in less than n genes. Default: 600. min_cells Filtered out genes that are detected in less than n cells. Default: 3. target_sum After normalization, each cell has a total count equal to target_sum. If None, total count of each cell equal to the median of total counts for cells before normalization. rna_n_top_features Number of highly-variable genes to keep. Default: 2000. Return ------- The AnnData object after preprocessing. """ if min_features is None: min_features = 600 if rna_n_top_features is None: rna_n_top_features = 2000 if target_sum is None: target_sum = 10000 # Preprocessing # if not issparse(adata.X): if type(adata.X) != csr.csr_matrix and (not backed) and (not adata.isbacked): adata.X = scipy.sparse.csr_matrix(adata.X) if not keep_mt: # Filtering out MT genes adata = adata[ :, [ gene for gene in adata.var_names if not str(gene).startswith(tuple(["ERCC", "MT-", "mt-"])) ], ] # Filtering cells sc.pp.filter_cells(adata, min_genes=min_features) # Filtering features sc.pp.filter_genes(adata, min_cells=min_cells) # Normalizing total per cell if adata.X.max() < 50: print( "Warning: adata_rna.X may have already been normalized, do not normalize, please check." ) adata.layers["norm_data"] = adata.X.copy() else: sc.pp.normalize_total(adata, target_sum=target_sum) # Log1p transforming sc.pp.log1p(adata) adata.layers["norm_data"] = adata.X.copy() # Finding variable features for RNA adata if used_hvgs: if type(rna_n_top_features) == int: if rna_n_top_features > len(adata.var_names): rna_n_top_features = len(adata.var_names) if batch_key is not None: sc.pp.highly_variable_genes( adata, n_top_genes=rna_n_top_features, batch_key=batch_key ) else: sc.pp.highly_variable_genes(adata, n_top_genes=rna_n_top_features) # sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) elif type(rna_n_top_features) != int: if isinstance(rna_n_top_features, str): if os.path.isfile(rna_n_top_features): rna_n_top_features = np.loadtxt(rna_n_top_features, dtype=str) adata = reindex(adata, rna_n_top_features) ## only HVGs use_highly_variable = "highly_variable" in adata.var adata_hvg = ( adata[:, adata.var["highly_variable"]].copy() if use_highly_variable else adata.copy() ) # scale data, clip values exceeding standard deviation 10. if adata_hvg.X.min() < 0: print( "Warning: adata.X may have already been scaled, do not scale, please check." ) else: sc.pp.scale(adata_hvg, max_value=10) # PCA sc.tl.pca(adata_hvg, svd_solver=svd_solver, n_comps=n_components) # 返回norm_data adata_hvg.X = ( adata_hvg.layers["norm_data"].copy() if "norm_data" in adata_hvg.layers.keys() else adata_hvg.X.copy() ) adata_hvg.obsm["feat"] = adata_hvg.obsm["X_pca"].copy() if used_pca_graph: # use scanpy functions to do the graph construction sc.pp.neighbors(adata_hvg, n_neighbors=n, metric=metric, use_rep="X_pca") return adata, adata_hvg
[docs] def preprocessing_atac( adata: AnnData, used_hvgs: bool = True, used_lsi_norm: bool = False, used_lsi_graph: bool = True, min_features: int = 100, min_cells: int = 3, atac_n_top_features=100000, # or gene list n: int = 6, batch_key: str = "batch", metric: str = "euclidean", n_components: int = 50, ): """ Preprocess scATAC data matrix. Parameters ---------- adata : AnnData object of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to peaks/regions. filter_rate : float, optional Proportion for feature selection, by default 0.01 """ if min_features is None: min_features = 100 if min_cells is None: min_cells = 3 if atac_n_top_features is None: atac_n_top_features = 10000 # Preprocessing if type(adata.X) != csr.csr_matrix: adata.X = scipy.sparse.csr_matrix(adata.X) sc.pp.filter_cells(adata, min_genes=min_features) sc.pp.filter_genes(adata, min_cells=min_cells) # adata.raw = adata ## TFIDF & LSI if used_lsi_norm: TFIDF_LSI(adata, n_comps=n_components, binarize=False, random_state=0) adata.layers["norm_data"] = adata.X.copy() else: # Normalizing total per cell if adata.X.max() < 2: print( "Warning: adata_rna.X may have already been normalized, do not normalize, please check." ) adata.layers["norm_data"] = adata.X.copy() else: sc.pp.normalize_total(adata, target_sum=1e4) # Log1p transforming sc.pp.log1p(adata) adata.layers["norm_data"] = adata.X.copy() # Finding variable features for ATAC adata if used_hvgs: if type(atac_n_top_features) == int: if atac_n_top_features > len(adata.var_names): atac_n_top_features = len(adata.var_names) if batch_key is not None: sc.pp.highly_variable_genes( adata, n_top_genes=atac_n_top_features, batch_key=batch_key ) else: sc.pp.highly_variable_genes(adata, n_top_genes=atac_n_top_features) # sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) elif type(atac_n_top_features) != int: if isinstance(atac_n_top_features, str): if os.path.isfile(atac_n_top_features): atac_n_top_features = np.loadtxt(atac_n_top_features, dtype=str) adata = reindex(adata, atac_n_top_features) ## only HVGs use_highly_variable = "highly_variable" in adata.var adata_hvg = ( adata[:, adata.var["highly_variable"]].copy() if use_highly_variable else adata.copy() ) if used_lsi_graph: if not used_lsi_norm: # scale data, clip values exceeding standard deviation 10. if adata_hvg.X.min() < 0: print( "Warning: adata.X may have already been scaled, do not scale, please check." ) else: sc.pp.scale(adata_hvg, max_value=10) # PCA sc.tl.pca(adata_hvg, n_comps=n_components) # 返回norm_data adata_hvg.X = adata_hvg.layers["norm_data"].copy() adata_hvg.obsm["feat"] = adata_hvg.obsm["X_pca"].copy() # use scanpy functions to do the graph construction sc.pp.neighbors(adata_hvg, n_neighbors=n, metric=metric, use_rep="X_pca") else: # 返回norm_data adata_hvg.X = ( adata_hvg.layers["norm_data"].copy() if "norm_data" in adata_hvg.layers.keys() else adata_hvg.X.copy() ) adata_hvg.obsm["feat"] = adata_hvg.obsm["X_lsi"].copy() # use scanpy functions to do the graph construction sc.pp.neighbors(adata_hvg, n_neighbors=n, metric=metric, use_rep="X_lsi") return adata, adata_hvg
[docs] def preprocessing_adt( adata: AnnData, used_hvgs: bool = False, used_pca_graph: bool = True, rna_n_top_features=2000, # or gene list n: int = 15, batch_key: str = "batch", metric: str = "euclidean", svd_solver: str = "arpack", backed: bool = False, ): """ Preprocessing single-cell RNA-seq data Parameters ---------- adata An AnnData matrice of shape n_obs x n_vars. Rows correspond to cells and columns to genes. min_features Filtered out cells that are detected in less than n genes. Default: 600. min_cells Filtered out genes that are detected in less than n cells. Default: 3. target_sum After normalization, each cell has a total count equal to target_sum. If None, total count of each cell equal to the median of total counts for cells before normalization. rna_n_top_features Number of highly-variable genes to keep. Default: 2000. Return ------- The AnnData object after preprocessing. """ # Preprocessing # if not issparse(adata.X): if type(adata.X) != csr.csr_matrix and (not backed) and (not adata.isbacked): adata.X = scipy.sparse.csr_matrix(adata.X) # Normalizing total per cell if adata.X.max() < 50: print( "Warning: adata_adt.X may have already been normalized, do not normalize, please check." ) else: adata = clr_normalize_each_cell(adata) adata.layers["norm_data"] = adata.X.copy() # Finding variable features for RNA adata if used_hvgs: if type(rna_n_top_features) == int: if rna_n_top_features > len(adata.var_names): rna_n_top_features = len(adata.var_names) if batch_key is not None: sc.pp.highly_variable_genes( adata, n_top_genes=rna_n_top_features, batch_key=batch_key ) else: sc.pp.highly_variable_genes(adata, n_top_genes=rna_n_top_features) # sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) elif type(rna_n_top_features) != int: if isinstance(rna_n_top_features, str): if os.path.isfile(rna_n_top_features): rna_n_top_features = np.loadtxt(rna_n_top_features, dtype=str) adata = reindex(adata, rna_n_top_features) ## only HVGs use_highly_variable = "highly_variable" in adata.var adata_use = ( adata[:, adata.var["highly_variable"]].copy() if use_highly_variable else adata.copy() ) if used_pca_graph: # scale data, clip values exceeding standard deviation 10. if adata_use.X.min() < 0: print( "Warning: adata.X may have already been scaled, do not scale, please check." ) else: sc.pp.scale(adata_use, max_value=10) # PCA sc.tl.pca(adata_use, svd_solver=svd_solver) # use scanpy functions to do the graph construction sc.pp.neighbors(adata_use, n_neighbors=n, metric=metric, use_rep="X_pca") # 返回norm_data if adata.X.max() < 50: pass else: adata_use.X = ( adata_use.layers["norm_data"].copy() if "norm_data" in adata_use.layers.keys() else adata_use.X.copy() ) adata_use.obsm["feat"] = adata_use.obsm["X_pca"].copy() else: sc.pp.neighbors(adata_use, n_neighbors=n, metric=metric, use_rep="X") adata_use.obsm["feat"] = adata_use.X.copy() return adata, adata_use
def save_adata_atac( atac_adata, genome, use_gene_weight, use_top_pcs, user_cache_path=None ): # 使用os.path.join来构建路径,避免平台依赖性问题 cache_path = ( os.path.join(user_cache_path, "adata_ATAC_cache.h5ad") if user_cache_path else "adata_ATAC_cache.h5ad" ) # 检查基因组参数是否有效 valid_genomes = {"hg19", "hg38", "mm9", "mm10"} if genome not in valid_genomes: raise ValueError(f"Invalid genome '{genome}'. Choose from {valid_genomes}.") # 检查是否已经存在缓存文件 if os.path.exists(cache_path): print( "Gene activity matrix has been calculated, loading cached adata_CG_atac object..." ) adata_CG_atac = sc.read_h5ad(cache_path) else: # 如果缓存文件不存在,执行耗时计算生成新的adata对象 print("Converting peak to gene activity matrix, this might take a while...") # 生成新的adata_CG_atac对象 adata_CG_atac = gene_scores( atac_adata, genome=genome, use_gene_weight=use_gene_weight, use_top_pcs=use_top_pcs, ) # 保存adata对象到缓存文件 adata_CG_atac.write_h5ad(cache_path) print(f"adata object cached at: {cache_path}") return adata_CG_atac ### preprocessing main function
[docs] def preprocessing( adata: [AnnData, MuData], profile: str = "RNA", data_type: str = "Paired", sub_data_type: str = ["rna", "atac"], batch_key: str = "batch", weight=0.8, used_hvgs: bool = True, graph_const_method: str = None, genome: str = None, use_gene_weight: bool = True, use_top_pcs: bool = False, user_cache_path: str = None, min_features: int = 600, min_cells: int = 3, target_sum: int = None, rna_n_top_features=None, # or gene list atac_n_top_features=None, # or gene list n_components: int = 50, svd_components_rna: int = 30, svd_components_atac: int = 30, cca_components: int = 20, cca_max_iter: int = 2000, randomized_svd: bool = False, filter_prop_initial: int = 0, filter_prop_refined: int = 0.3, filter_prop_propagated: int = 0, n_iters: int = 1, svd_runs: int = 1, n: int = 15, metric: str = "euclidean", svd_solver: str = "arpack", keep_mt: bool = False, backed: bool = False, verbose: bool = True, ): """ Preprocessing function for single-cell and multi-modal data. Parameters ---------- adata : [AnnData, MuData] The annotated data matrix of shape n_obs x n_vars. Rows correspond to cells and columns to genes. profile : str, optional The profile type, by default 'RNA'. data_type : str, optional The data type, by default 'Paired'. sub_data_type : list of str, optional The sub data types, by default ['rna', 'atac']. batch_key : str, optional The batch key, by default 'batch'. weight : float, optional The weight for combining adjacency matrices, by default 0.8. used_hvgs : bool, optional Whether to use highly variable genes, by default True. graph_const_method : str, optional The method for graph construction of spatial data, by default None. genome : str, optional The genome reference, by default None. use_gene_weight : bool, optional Whether to use gene weight, by default True. use_top_pcs : bool, optional Whether to use top principal components, by default False. user_cache_path: str, optional The path to save the cache file, by default None. min_features : int, optional Minimum number of features, by default 600. min_cells : int, optional Minimum number of cells, by default 3. target_sum : int, optional Target sum for normalization, by default None. rna_n_top_features : int or list, optional Number of top features for RNA, by default None. atac_n_top_features : int or list, optional Number of top features for ATAC, by default None. Return ------- The AnnData object after preprocessing. """ if profile == "RNA": return preprocessing_rna( adata, min_features=min_features, min_cells=min_cells, target_sum=target_sum, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, keep_mt=keep_mt, backed=backed, ) elif profile == "ATAC": return preprocessing_atac( adata, min_features=min_features, min_cells=min_cells, used_hvgs=used_hvgs, used_lsi_graph=True, atac_n_top_features=atac_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, ) elif profile == "ADT": return preprocessing_adt( adata, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, # or gene list n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, backed=backed, ) elif profile == "multi-modal": # UnPaired TO DO assert data_type in [ "Paired", "UnPaired", ], 'Data_type must be "Paired", or "UnPaired".' assert sub_data_type[0] in ["rna", "atac", "adt"] and sub_data_type[1] in [ "rna", "atac", "adt", ], 'Sub_data_type must be "rna", "atac", or "adt".' if len(sub_data_type) == 2: if sub_data_type[0] == "rna" and sub_data_type[1] == "atac": rna_adata = adata.mod["rna"] atac_adata = adata.mod["atac"] adt_adata = None elif sub_data_type[0] == "rna" and sub_data_type[1] == "adt": rna_adata = adata.mod["rna"] atac_adata = None adt_adata = adata.mod["adt"] else: ValueError( 'The length of sub_data_type must be 2, such as: ["rna", "atac"] or ["rna", "adt"].' ) if data_type == "Paired": rna_adata, rna_adata_hvg = preprocessing_rna( rna_adata, min_features=min_features, min_cells=min_cells, target_sum=target_sum, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, keep_mt=keep_mt, backed=backed, ) if atac_adata is not None: atac_adata, atac_adata_hvg = preprocessing_atac( atac_adata, min_features=min_features, min_cells=min_cells, used_hvgs=used_hvgs, used_lsi_graph=True, atac_n_top_features=atac_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, ) ## Concatenating different modalities adata_paired = ad.concat([rna_adata_hvg, atac_adata_hvg], axis=1) # 假设 obsm 的每个值都是形状相同的矩阵,可以通过 np.hstack 或其他方法进行合并 rna_obsm = rna_adata_hvg.obsm atac_obsm = atac_adata_hvg.obsm # 只需要 feat key # combined_obsm = {key: np.hstack((rna_obsm[key], atac_obsm[key])) for key in rna_obsm.keys()} combined_obsm = np.hstack((rna_obsm["feat"], atac_obsm["feat"])) # 将合并后的 obsm 信息添加到 adata_paired,但先确保索引匹配 adata_paired.obsm["feat"] = pd.DataFrame( combined_obsm, index=adata_paired.obs.index # 确保索引一致 ) ## the .obs layer is empty now, and we need to repopulate it rna_cols = rna_adata.obs.columns atac_cols = atac_adata.obs.columns rnaobs = rna_adata.obs.copy() rnaobs.columns = ["rna:" + x for x in rna_cols] atacobs = atac_adata.obs.copy() atacobs.columns = ["atac:" + x for x in atac_cols] adata_paired.obs = pd.merge( rnaobs, atacobs, left_index=True, right_index=True ) ## 先将 scATAC 转换为基因活性矩阵 adata_CG_atac = save_adata_atac( atac_adata, genome=genome, use_gene_weight=use_gene_weight, use_top_pcs=use_top_pcs, user_cache_path=user_cache_path, ) ## 交集 common_genes = set(rna_adata.var_names).intersection( set(adata_CG_atac.var_names) ) print( "There are {} common genes in RNA and ATAC datasets".format( len(common_genes) ) ) rna_adata_shared = rna_adata[:, list(common_genes)].copy() atac_adata_shared = adata_CG_atac[:, list(common_genes)].copy() # 通过cell matching 构建组学间的图结构 print( "To start performing cell matching for adjacency matrix of the graph, please wait...", flush=True, ) inter_connect = create_adj( rna_adata, atac_adata, rna_adata_shared, atac_adata_shared, data_type=data_type, rna_n_top_features=rna_n_top_features, atac_n_top_features=atac_n_top_features, batch_key=batch_key, svd_components1=svd_components_rna, svd_components2=svd_components_atac, cca_components=cca_components, cca_max_iter=cca_max_iter, randomized_svd=randomized_svd, filter_prop_initial=filter_prop_initial, filter_prop_refined=filter_prop_refined, filter_prop_propagated=filter_prop_propagated, n_iters=n_iters, svd_runs=svd_runs, verbose=verbose, ) # 设置权重 # Iterate over keys in rna_obsp (assuming atac_obsp has the same keys) for key in rna_adata_hvg.obsp.keys(): # 计算加权平均的连通性矩阵 intra_connect = rna_adata_hvg.obsp[key] + atac_adata_hvg.obsp[key] adata_paired.obsp["connectivities"] = ( weight * inter_connect + (1 - weight) * intra_connect ) return adata_paired elif adt_adata is not None: adt_adata, adt_adata_hvg = preprocessing_adt( adt_adata, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, # or gene list n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, backed=backed, ) # Here we are integrating protein and RNA data, # and most of the time there are name differences between protein (antibody) # and their corresponding gene names. correspondence = pkgutil.get_data( "Garfield", "data/gene_anno/protein_gene_conversion.csv" ) correspondence = pd.read_csv( io.BytesIO(correspondence), encoding="utf8", index_col=0 ) rna_protein_correspondence = [] for i in range(correspondence.shape[0]): curr_protein_name, curr_rna_names_human, curr_rna_names_mouse = correspondence.iloc[i] if determine_species(rna_adata) == "human": if curr_protein_name not in adt_adata_hvg.var_names: continue if ( curr_rna_names_human.find("Ignore") != -1 ): # some correspondence ignored eg. protein isoform to one gene continue curr_rna_names_human = curr_rna_names_human.split( "/" ) # eg. one protein to multiple genes for r in curr_rna_names_human: if r in rna_adata_hvg.var_names: rna_protein_correspondence.append([r, curr_protein_name]) elif determine_species(rna_adata) == "mouse": if curr_protein_name not in adt_adata_hvg.var_names: continue if ( curr_rna_names_mouse.find("Ignore") != -1 ): # some correspondence ignored eg. protein isoform to one gene continue curr_rna_names_mouse = curr_rna_names_mouse.split( "/" ) # eg. one protein to multiple genes for r in curr_rna_names_mouse: if r in rna_adata_hvg.var_names: rna_protein_correspondence.append([r, curr_protein_name]) rna_protein_correspondence = np.array(rna_protein_correspondence) ## Concatenating different modalities adata_paired = ad.concat([rna_adata_hvg, adt_adata_hvg], axis=1) # 假设 obsm 的每个值都是形状相同的矩阵,可以通过 np.hstack 或其他方法进行合并 rna_obsm = rna_adata_hvg.obsm adt_obsm = adt_adata_hvg.obsm # 只需要 feat key # combined_obsm = {key: np.hstack((rna_obsm[key], atac_obsm[key])) for key in rna_obsm.keys()} combined_obsm = np.hstack((rna_obsm["feat"], adt_obsm["feat"])) # 将合并后的 obsm 信息添加到 adata_paired,但先确保索引匹配 adata_paired.obsm["feat"] = pd.DataFrame( combined_obsm, index=adata_paired.obs.index # 确保索引一致 ) ## the .obs layer is empty now, and we need to repopulate it rna_cols = rna_adata.obs.columns adt_cols = adt_adata.obs.columns rnaobs = rna_adata.obs.copy() rnaobs.columns = ["rna:" + x for x in rna_cols] adtobs = adt_adata.obs.copy() adtobs.columns = ["adt:" + x for x in adt_cols] adata_paired.obs = pd.merge( rnaobs, adtobs, left_index=True, right_index=True ) ## 交集 # 如果 adt_adata.var_names 全为大写,则直接取交集 if all([x.isupper() for x in adt_adata.var_names]): common_genes = set(rna_adata.var_names).intersection( set(adt_adata.var_names) ) print( "There are {} common genes in RNA and ADT datasets".format( len(common_genes) ) ) rna_adata_shared = rna_adata[:, list(common_genes)].copy() adt_adata_shared = adt_adata[:, list(common_genes)].copy() else: print( "There are {} common genes in RNA and ADT datasets".format( rna_protein_correspondence.shape[0] ) ) rna_adata_shared = rna_adata[:, rna_protein_correspondence[:, 0]].copy() adt_adata_shared = adt_adata[:, rna_protein_correspondence[:, 1]].copy() ## 通过cell matching 构建组学间的图结构 print( "To start performing cell matching for adjacency matrix of the graph, please wait...", flush=True, ) inter_connect = create_adj( rna_adata, adt_adata, rna_adata_shared, adt_adata_shared, data_type=data_type, rna_n_top_features=rna_n_top_features, atac_n_top_features=atac_n_top_features, batch_key=batch_key, svd_components1=svd_components_rna, svd_components2=svd_components_atac, cca_components=cca_components, cca_max_iter=cca_max_iter, randomized_svd=randomized_svd, filter_prop_initial=filter_prop_initial, filter_prop_refined=filter_prop_refined, filter_prop_propagated=filter_prop_propagated, n_iters=n_iters, svd_runs=svd_runs, verbose=verbose, ) # Iterate over keys in rna_obsp (assuming adt_obsp has the same keys) for key in rna_adata_hvg.obsp.keys(): # 计算加权平均的连通性矩阵 intra_connect = rna_adata_hvg.obsp[key] + adt_adata_hvg.obsp[key] adata_paired.obsp["connectivities"] = ( weight * inter_connect + (1 - weight) * intra_connect ) return adata_paired elif profile == "spatial": assert data_type in [ "single-modal", "multi-modal", ], 'Data_type must be "single-modal", or "multi-modal".' if data_type == "single-modal": _, rna_adata_hvg = preprocessing_rna( adata, min_features=min_features, min_cells=min_cells, target_sum=target_sum, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, keep_mt=True, backed=backed, ) # Construct Spatial Graph if graph_const_method == "mu_std": spatial_adj = graph_construction( adata, mode="mu_std", k=n, batch_key=batch_key ) # spatial_adj = graph_dict #.toarray() elif graph_const_method == "Radius": spatial_adj = graph_construction( adata, mode="Radius", k=n, batch_key=batch_key ) elif graph_const_method == "KNN": spatial_adj = graph_construction( adata, mode="KNN", k=n, batch_key=batch_key ) elif graph_const_method == "Squidpy": import squidpy as sq sq.gr.spatial_neighbors(adata, coord_type="generic", n_neighs=n) # Make adjacency matrix symmetric adata.obsp["spatial_connectivities"] = adata.obsp[ "spatial_connectivities" ].maximum(adata.obsp["spatial_connectivities"].T) spatial_adj = adata.obsp["spatial_connectivities"] # .toarray() # Ensure adj is a csr_matrix if not isinstance(spatial_adj, csr_matrix): spatial_adj = csr_matrix(spatial_adj) expr_adj = rna_adata_hvg.obsp["connectivities"] # Ensure adj is a csr_matrix if not isinstance(expr_adj, csr_matrix): expr_adj = csr_matrix(expr_adj) # Validate adjacency matrix symmetry if spatial_adj.getnnz() != spatial_adj.T.getnnz(): raise ImportError( "The spatial_adj adjacency matrix has to be symmetric." ) # Validate adjacency matrix symmetry if expr_adj.getnnz() != expr_adj.T.getnnz(): raise ImportError("The expr_adj adjacency matrix has to be symmetric.") # 计算加权平均的连通性矩阵 rna_adata_hvg.obsp["spatial_connectivities"] = spatial_adj rna_adata_hvg.obsp["connectivities"] = ( weight * spatial_adj + (1 - weight) * expr_adj ) return rna_adata_hvg elif data_type == "multi-modal": assert sub_data_type[0] in ["rna", "atac", "adt"] and sub_data_type[1] in [ "rna", "atac", "adt", ], 'Sub_data_type must be "rna", "atac", or "adt".' if len(sub_data_type) == 2: if sub_data_type[0] == "rna" and sub_data_type[1] == "atac": rna_adata = adata.mod["rna"] atac_adata = adata.mod["atac"] adt_adata = None elif sub_data_type[0] == "rna" and sub_data_type[1] == "adt": rna_adata = adata.mod["rna"] atac_adata = None adt_adata = adata.mod["adt"] else: ValueError( 'The length of sub_data_type must be 2, such as: ["rna", "atac"] or ["rna", "adt"].' ) rna_adata, rna_adata_hvg = preprocessing_rna( rna_adata, min_features=min_features, min_cells=min_cells, target_sum=target_sum, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, keep_mt=True, backed=backed, ) if atac_adata is not None: atac_adata, atac_adata_hvg = preprocessing_atac( atac_adata, min_features=min_features, min_cells=min_cells, used_hvgs=used_hvgs, used_lsi_graph=True, atac_n_top_features=atac_n_top_features, n_components=n_components, n=n, batch_key=batch_key, metric=metric, ) ## Concatenating different modalities adata_paired = ad.concat([rna_adata_hvg, atac_adata_hvg], axis=1) # 假设 obsm 的每个值都是形状相同的矩阵,可以通过 np.hstack 或其他方法进行合并 rna_obsm = rna_adata_hvg.obsm atac_obsm = atac_adata_hvg.obsm # 只需要 feat key # combined_obsm = {key: np.hstack((rna_obsm[key], atac_obsm[key])) for key in rna_obsm.keys()} combined_obsm = np.hstack((rna_obsm["feat"], atac_obsm["feat"])) # 将合并后的 obsm 信息添加到 adata_paired,但先确保索引匹配 adata_paired.obsm["feat"] = pd.DataFrame( combined_obsm, index=adata_paired.obs.index # 确保索引一致 ) adata_paired.obsm["spatial"] = rna_adata.obsm["spatial"] ## the .obs layer is empty now, and we need to repopulate it rna_cols = rna_adata.obs.columns atac_cols = atac_adata.obs.columns rnaobs = rna_adata.obs.copy() rnaobs.columns = ["rna:" + x for x in rna_cols] atacobs = atac_adata.obs.copy() atacobs.columns = ["atac:" + x for x in atac_cols] adata_paired.obs = pd.merge( rnaobs, atacobs, left_index=True, right_index=True ) ## 先将 scATAC 转换为基因活性矩阵 if atac_adata.var_names[0].startswith('chr'): adata_CG_atac = save_adata_atac( atac_adata, genome=genome, use_gene_weight=use_gene_weight, use_top_pcs=use_top_pcs, user_cache_path=user_cache_path, ) ## 交集 common_genes = set(rna_adata.var_names).intersection( set(adata_CG_atac.var_names) ) print( "There are {} common genes in RNA and ATAC datasets".format( len(common_genes) ) ) rna_adata_shared = rna_adata[:, list(common_genes)].copy() atac_adata_shared = adata_CG_atac[:, list(common_genes)].copy() else: ## 交集 common_genes = set(rna_adata.var_names).intersection( set(atac_adata.var_names) ) print( "There are {} common genes in RNA and ATAC datasets".format( len(common_genes) ) ) rna_adata_shared = rna_adata[:, list(common_genes)].copy() atac_adata_shared = atac_adata[:, list(common_genes)].copy() # 通过cell matching 构建组学间的图结构 print( "To start performing cell matching for adjacency matrix of the graph, please wait...", flush=True, ) inter_connect = create_adj( rna_adata, atac_adata, rna_adata_shared, atac_adata_shared, data_type=data_type, rna_n_top_features=rna_n_top_features, atac_n_top_features=atac_n_top_features, batch_key=batch_key, svd_components1=svd_components_rna, svd_components2=svd_components_atac, cca_components=cca_components, cca_max_iter=cca_max_iter, randomized_svd=randomized_svd, filter_prop_initial=filter_prop_initial, filter_prop_refined=filter_prop_refined, filter_prop_propagated=filter_prop_propagated, n_iters=n_iters, svd_runs=svd_runs, verbose=verbose, ) # Construct Spatial Graph if graph_const_method == "mu_std": spatial_adj = graph_construction( rna_adata_hvg, mode="mu_std", k=n, batch_key=batch_key ) elif graph_const_method == "Radius": spatial_adj = graph_construction( adata, mode="Radius", k=n, batch_key=batch_key ) elif graph_const_method == "KNN": spatial_adj = graph_construction( adata, mode="KNN", k=n, batch_key=batch_key ) elif graph_const_method == "Squidpy": import squidpy as sq sq.gr.spatial_neighbors( rna_adata_hvg, coord_type="generic", n_neighs=n ) spatial_adj = rna_adata_hvg.obsp[ "spatial_connectivities" ] # .toarray() # Ensure adj is a csr_matrix if not isinstance(spatial_adj, csr_matrix): spatial_adj = csr_matrix(spatial_adj) if not isinstance(inter_connect, csr_matrix): inter_connect = csr_matrix(inter_connect) # Iterate over keys in rna_obsp (assuming adt_obsp has the same keys) # for key in rna_adata_hvg.obsp.keys(): # # 计算加权平均的连通性矩阵 # intra_connect = rna_adata_hvg.obsp[key] + atac_adata_hvg.obsp[key] # 计算加权平均的连通性矩阵 adata_paired.obsp["spatial_connectivities"] = spatial_adj adata_paired.obsp["connectivities"] = ( weight * spatial_adj + (1 - weight) * inter_connect ) return adata_paired elif adt_adata is not None: adt_adata, adt_adata_hvg = preprocessing_adt( adt_adata, used_hvgs=used_hvgs, used_pca_graph=True, rna_n_top_features=rna_n_top_features, # or gene list n=n, batch_key=batch_key, metric=metric, svd_solver=svd_solver, backed=backed, ) correspondence = pkgutil.get_data( "Garfield", "data/gene_anno/protein_gene_conversion.csv" ) correspondence = pd.read_csv( io.BytesIO(correspondence), encoding="utf8", index_col=0 ) rna_protein_correspondence = [] for i in range(correspondence.shape[0]): curr_protein_name, curr_rna_names_human, curr_rna_names_mouse = correspondence.iloc[i] if determine_species(rna_adata) == "human": if curr_protein_name not in adt_adata_hvg.var_names: continue if ( curr_rna_names_human.find("Ignore") != -1 ): # some correspondence ignored eg. protein isoform to one gene continue curr_rna_names_human = curr_rna_names_human.split( "/" ) # eg. one protein to multiple genes for r in curr_rna_names_human: if r in rna_adata_hvg.var_names: rna_protein_correspondence.append([r, curr_protein_name]) elif determine_species(rna_adata) == "mouse": if curr_protein_name not in adt_adata_hvg.var_names: continue if ( curr_rna_names_mouse.find("Ignore") != -1 ): # some correspondence ignored eg. protein isoform to one gene continue curr_rna_names_mouse = curr_rna_names_mouse.split( "/" ) # eg. one protein to multiple genes for r in curr_rna_names_mouse: if r in rna_adata_hvg.var_names: rna_protein_correspondence.append([r, curr_protein_name]) rna_protein_correspondence = np.array(rna_protein_correspondence) ## Concatenating different modalities adata_paired = ad.concat([rna_adata_hvg, adt_adata_hvg], axis=1) # 假设 obsm 的每个值都是形状相同的矩阵,可以通过 np.hstack 或其他方法进行合并 rna_obsm = rna_adata_hvg.obsm adt_obsm = adt_adata_hvg.obsm # 只需要 feat key # combined_obsm = {key: np.hstack((rna_obsm[key], atac_obsm[key])) for key in rna_obsm.keys()} combined_obsm = np.hstack((rna_obsm["feat"], adt_obsm["feat"])) # 将合并后的 obsm 信息添加到 adata_paired,但先确保索引匹配 adata_paired.obsm["feat"] = pd.DataFrame( combined_obsm, index=adata_paired.obs.index # 确保索引一致 ) adata_paired.obsm["spatial"] = rna_adata.obsm["spatial"] ## the .obs layer is empty now, and we need to repopulate it rna_cols = rna_adata.obs.columns adt_cols = adt_adata.obs.columns rnaobs = rna_adata.obs.copy() rnaobs.columns = ["rna:" + x for x in rna_cols] adtobs = adt_adata.obs.copy() adtobs.columns = ["adt:" + x for x in adt_cols] adata_paired.obs = pd.merge( rnaobs, adtobs, left_index=True, right_index=True ) ## 交集 # 如果 adt_adata.var_names 全为大写,则直接取交集 if all([x.isupper() for x in adt_adata.var_names]): common_genes = set(rna_adata.var_names).intersection( set(adt_adata.var_names) ) print( "There are {} common genes in RNA and ADT datasets".format( len(common_genes) ) ) rna_adata_shared = rna_adata[:, list(common_genes)].copy() adt_adata_shared = adt_adata[:, list(common_genes)].copy() else: print( "There are {} common genes in RNA and ADT datasets".format( rna_protein_correspondence.shape[0] ) ) rna_adata_shared = rna_adata[:, rna_protein_correspondence[:, 0]].copy() adt_adata_shared = adt_adata[:, rna_protein_correspondence[:, 1]].copy() ## 通过cell matching 构建组学间的图结构 print( "To start performing cell matching for adjacency matrix of the graph, please wait...", flush=True, ) inter_connect = create_adj( rna_adata_hvg, adt_adata, rna_adata_shared, adt_adata_shared, data_type=data_type, rna_n_top_features=rna_n_top_features, atac_n_top_features=atac_n_top_features, batch_key=batch_key, svd_components1=svd_components_rna, svd_components2=svd_components_atac, cca_components=cca_components, cca_max_iter=cca_max_iter, randomized_svd=randomized_svd, filter_prop_initial=filter_prop_initial, filter_prop_refined=filter_prop_refined, filter_prop_propagated=filter_prop_propagated, n_iters=n_iters, svd_runs=svd_runs, verbose=verbose, ) # Construct Spatial Graph if graph_const_method == "mu_std": spatial_adj = graph_construction( rna_adata_hvg, mode="mu_std", k=n, batch_key=batch_key ) elif graph_const_method == "Radius": spatial_adj = graph_construction( adata, mode="Radius", k=n, batch_key=batch_key ) elif graph_const_method == "KNN": spatial_adj = graph_construction( adata, mode="KNN", k=n, batch_key=batch_key ) elif graph_const_method == "Squidpy": import squidpy as sq sq.gr.spatial_neighbors( rna_adata_hvg, coord_type="generic", n_neighs=n ) spatial_adj = rna_adata_hvg.obsp[ "spatial_connectivities" ] # .toarray() # Ensure adj is a csr_matrix if not isinstance(spatial_adj, csr_matrix): spatial_adj = csr_matrix(spatial_adj) if not isinstance(inter_connect, csr_matrix): inter_connect = csr_matrix(inter_connect) # 计算加权平均的连通性矩阵 adata_paired.obsp["spatial_connectivities"] = spatial_adj adata_paired.obsp["connectivities"] = ( weight * spatial_adj + (1 - weight) * inter_connect ) return adata_paired else: raise ValueError("Not support profile: `{}` yet".format(profile))