Source code for Garfield.preprocessing.preprocess

from anndata import AnnData
from mudata import MuData

# read data
from ..data.datareaders import concat_data

# preprocessing
from ..preprocessing.preprocess_utils import preprocessing


def DataProcess(
    adata_list,
    profile,
    data_type=None,
    sub_data_type=None,
    sample_col="batch",
    genome=None,
    weight=None,
    graph_const_method=None,
    use_gene_weight=True,
    user_cache_path=None,
    use_top_pcs=False,
    used_hvg=True,
    min_features=100,
    min_cells=3,
    keep_mt=False,
    target_sum=1e4,
    rna_n_top_features=3000,
    atac_n_top_features=10000,
    n_components=50,
    n_neighbors=15,
    metric="correlation",
    svd_solver="arpack",
):
    """
    Concatenate and preprocess single- or multi-modal data.

    The inputs are merged with ``concat_data`` and handed to ``preprocessing``
    with the options below. Inputs that already carry a non-empty
    ``obsm["garfield_latent"]`` are returned untouched.

    Parameters
    ----------
    adata_list : list of AnnData or MuData objects
        Objects to be concatenated and processed. A single object (not wrapped
        in a list) is also accepted.
    profile : str
        Data profile type: 'RNA', 'ATAC', 'ADT', 'multi-modal', or 'spatial'.
    data_type : str, optional
        Type of data being processed, e.g., 'single-cell', 'bulk'.
        Default is None.
    sub_data_type : list[str], optional
        Sub-data types for multi-modal data. For ``profile='multi-modal'`` it
        must be ['rna', 'atac'] or ['rna', 'adt']. Default is None.
    sample_col : str, optional
        Column used to indicate batch or sample groupings; passed as
        ``batch_key``. Default is 'batch'.
    genome : str, optional
        Reference genome for the dataset. Default is None.
    weight : float or None, optional
        Weight forwarded to ``preprocessing`` for the 'multi-modal' and
        'spatial' profiles. Default is None.
    graph_const_method : str, optional
        Graph construction method; only forwarded for the 'spatial' profile.
        Default is None.
    use_gene_weight : bool, optional
        Whether to use gene weights in the preprocessing steps.
        Default is True.
    user_cache_path : str, optional
        Path to the user's cache directory; forwarded for the 'multi-modal'
        and 'spatial' profiles. Default is None.
    use_top_pcs : bool, optional
        Whether to use the top principal components during dimensionality
        reduction. Default is False.
    used_hvg : bool, optional
        Whether to use highly variable genes (HVG). Default is True.
    min_features : int, optional
        Minimum number of features required for a cell to be included.
        Default is 100.
    min_cells : int, optional
        Minimum number of cells required for a feature to be included.
        Default is 3.
    keep_mt : bool, optional
        Whether to keep mitochondrial genes. Default is False.
    target_sum : float, optional
        Target sum for normalization. Default is 1e4.
    rna_n_top_features : int, optional
        Number of top features to keep for RNA data. Default is 3000.
    atac_n_top_features : int, optional
        Number of top features to keep for ATAC data. Default is 10000.
    n_components : int, optional
        Number of components for dimensionality reduction. Default is 50.
    n_neighbors : int, optional
        Number of neighbors for graph-based algorithms. Default is 15.
    metric : str, optional
        Distance metric to use in graph construction.
        Default is 'correlation'.
    svd_solver : str, optional
        Solver to use for singular value decomposition. Default is 'arpack'.

    Returns
    -------
    object
        * ``adata_list`` itself, unchanged, when any input already holds a
          non-empty ``obsm["garfield_latent"]``.
        * For 'RNA', 'ATAC' and 'ADT': the second element of the pair returned
          by ``preprocessing``.
        * For 'multi-modal' and 'spatial': whatever ``preprocessing`` returns.
        * The string ``"Unknown input data type."`` for any other profile
          (kept for backward compatibility).

    Raises
    ------
    ValueError
        If ``profile`` is 'multi-modal' and ``sub_data_type`` is not
        ['rna', 'atac'] or ['rna', 'adt'].
    """
    # Accept a bare object as well as a list so the latent check can iterate.
    candidates = adata_list if isinstance(adata_list, list) else [adata_list]

    # Inputs that already contain a Garfield latent embedding are returned as-is.
    if any(
        "garfield_latent" in item.obsm and item.obsm["garfield_latent"].size > 0
        for item in candidates
    ):
        return adata_list

    # Validate before the (potentially expensive) concatenation so that a bad
    # `sub_data_type` fails fast with a clear error instead of an unbound-name
    # crash later on.
    second_modality = None
    if profile == "multi-modal":
        second_modality = _paired_second_modality(sub_data_type)

    # Load and merge the data.
    adata = concat_data(
        adata_list,
        batch_categories=None,
        join="inner",
        batch_key=sample_col,
        index_unique=None,
        save=None,
    )
    _stash_raw_counts(adata)

    # Keyword arguments that every profile passes to `preprocessing`.
    shared_kwargs = dict(
        profile=profile,
        data_type=data_type,
        genome=genome,
        use_gene_weight=use_gene_weight,
        use_top_pcs=use_top_pcs,
        used_hvgs=used_hvg,
        min_features=min_features,
        min_cells=min_cells,
        target_sum=target_sum,
        rna_n_top_features=rna_n_top_features,
        atac_n_top_features=atac_n_top_features,
        n_components=n_components,
        n=n_neighbors,
        batch_key=sample_col,
        metric=metric,
        svd_solver=svd_solver,
        keep_mt=keep_mt,
    )

    # Single-modality profiles: RNA, ATAC, ADT.
    if profile in ("RNA", "ATAC", "ADT"):
        _, adata_hvg = preprocessing(adata, **shared_kwargs)
        return adata_hvg

    # Paired multi-modal data.
    if profile == "multi-modal":
        mdata = MuData(
            {
                "rna": adata.mod["rna"].copy(),
                second_modality: adata.mod[second_modality].copy(),
            }
        )
        # Drop the reference to the merged object; only the copies are needed.
        del adata
        return preprocessing(
            mdata,
            sub_data_type=sub_data_type,
            weight=weight,
            user_cache_path=user_cache_path,
            **shared_kwargs,
        )

    # Spatial single- or multi-modal data.
    if profile == "spatial":
        return preprocessing(
            adata,
            sub_data_type=sub_data_type,
            weight=weight,
            graph_const_method=graph_const_method,
            user_cache_path=user_cache_path,
            **shared_kwargs,
        )

    # NOTE(review): a string sentinel is returned rather than raising; kept
    # unchanged so existing callers that compare against it keep working.
    return "Unknown input data type."


def _paired_second_modality(sub_data_type):
    """Return the modality paired with 'rna', or raise ValueError if unsupported."""
    if sub_data_type is None or len(sub_data_type) != 2:
        raise ValueError(
            'The length of sub_data_type must be 2, such as: ["rna", "atac"] or ["rna", "adt"].'
        )
    first, second = sub_data_type[0], sub_data_type[1]
    if first != "rna" or second not in ("atac", "adt"):
        raise ValueError(
            'sub_data_type must be ["rna", "atac"] or ["rna", "adt"], got: '
            + repr(list(sub_data_type))
        )
    return second


def _stash_raw_counts(adata):
    """Copy X into layers['counts'], or print a warning when X.max() is below 50."""
    if isinstance(adata, AnnData):
        target = adata
    elif isinstance(adata, MuData):
        # For MuData only the 'rna' modality is inspected.
        target = adata.mod["rna"]
    else:
        return

    # A maximum below 50 is used as the signal that X is not raw counts.
    if target.X.max() < 50:
        print(
            "Warning: adata.X may have already been normalized, adata.X must be `counts`, please check."
        )
    else:
        target.layers["counts"] = target.X.copy()