from anndata import AnnData
from mudata import MuData
# read data
from ..data.datareaders import concat_data
# preprocessing
from ..preprocessing.preprocess_utils import preprocessing
# [docs]  (link marker left over from the rendered documentation page)
def _check_multimodal_sub_data_type(sub_data_type):
    """Raise ``ValueError`` unless ``sub_data_type`` is a supported modality pair.

    The 'multi-modal' profile only handles ``["rna", "atac"]`` and
    ``["rna", "adt"]`` (in that order).
    """
    supported_pairs = (("rna", "atac"), ("rna", "adt"))
    if sub_data_type is None or len(sub_data_type) != 2:
        raise ValueError(
            'The length of sub_data_type must be 2, such as: ["rna", "atac"] or ["rna", "adt"].'
        )
    if tuple(sub_data_type) not in supported_pairs:
        raise ValueError(
            f"Unsupported sub_data_type {sub_data_type!r}; "
            'expected ["rna", "atac"] or ["rna", "adt"].'
        )


def _store_counts_layer(adata):
    """Copy ``X`` into ``layers["counts"]``, or warn if ``X`` looks normalized.

    For an ``AnnData`` the object itself is used; for a ``MuData`` only the
    ``"rna"`` modality is touched. Any other type is left alone.
    """
    if isinstance(adata, AnnData):
        target = adata
    elif isinstance(adata, MuData):
        target = adata.mod["rna"]
    else:
        return

    # Heuristic: a maximum value below 50 is treated as a sign that the matrix
    # has already been normalized, so it is not stored as raw counts.
    if target.X.max() < 50:
        print(
            "Warning: adata.X may have already been normalized, adata.X must be `counts`, please check."
        )
    else:
        target.layers["counts"] = target.X.copy()


def DataProcess(
    adata_list,
    profile,
    data_type=None,
    sub_data_type=None,
    sample_col="batch",
    genome=None,
    weight=None,
    graph_const_method=None,
    use_gene_weight=True,
    user_cache_path=None,
    use_top_pcs=False,
    used_hvg=True,
    min_features=100,
    min_cells=3,
    keep_mt=False,
    target_sum=1e4,
    rna_n_top_features=3000,
    atac_n_top_features=10000,
    n_components=50,
    n_neighbors=15,
    metric="correlation",
    svd_solver="arpack",
):
    """
    Concatenate single- or multi-modal data and run it through ``preprocessing``.

    If any input object already carries a non-empty ``obsm["garfield_latent"]``,
    the input is returned unchanged and nothing else is done. Otherwise the
    inputs are merged with ``concat_data`` (inner join, batch key
    ``sample_col``), the raw matrix is saved to ``layers["counts"]`` when it
    does not look normalized, and ``preprocessing`` is called with the
    arguments relevant to ``profile``.

    Parameters
    ----------
    adata_list : AnnData, MuData, or list of them
        Object(s) to be concatenated and processed. A single object is accepted.
    profile : str
        Data profile: 'RNA', 'ATAC', 'ADT', 'multi-modal', or 'spatial'.
    data_type : str, optional
        Type of data being processed; forwarded to ``preprocessing``.
        Default is None.
    sub_data_type : list[str], optional
        Modalities for multi-modal data. For the 'multi-modal' profile it must
        be ``['rna', 'atac']`` or ``['rna', 'adt']``. Also forwarded for the
        'spatial' profile. Default is None.
    sample_col : str, optional
        Column used to indicate batch or sample groupings; used as the batch
        key for concatenation and preprocessing. Default is 'batch'.
    genome : str, optional
        Reference genome for the dataset. Default is None.
    weight : float or None, optional
        Weight forwarded to ``preprocessing`` for the 'multi-modal' and
        'spatial' profiles. Default is None.
    graph_const_method : str, optional
        Graph construction method; forwarded only for the 'spatial' profile.
        Default is None.
    use_gene_weight : bool, optional
        Whether to use gene weights in the preprocessing steps. Default is True.
    user_cache_path : str, optional
        Path to the user's cache directory; forwarded for the 'multi-modal'
        and 'spatial' profiles. Default is None.
    use_top_pcs : bool, optional
        Whether to use the top principal components during dimensionality
        reduction. Default is False.
    used_hvg : bool, optional
        Whether to use highly variable genes (HVG) for the analysis.
        Default is True.
    min_features : int, optional
        Minimum number of features required for a cell to be included.
        Default is 100.
    min_cells : int, optional
        Minimum number of cells required for a feature to be included.
        Default is 3.
    keep_mt : bool, optional
        Whether to keep mitochondrial genes in the dataset. Default is False.
    target_sum : float, optional
        Target sum for normalization. Default is 1e4.
    rna_n_top_features : int, optional
        Number of top features to keep for RNA data. Default is 3000.
    atac_n_top_features : int, optional
        Number of top features to keep for ATAC data. Default is 10000.
    n_components : int, optional
        Number of components for dimensionality reduction (e.g., PCA).
        Default is 50.
    n_neighbors : int, optional
        Number of neighbors for graph-based algorithms. Default is 15.
    metric : str, optional
        Distance metric to use in graph construction. Default is 'correlation'.
    svd_solver : str, optional
        Solver to use for singular value decomposition (SVD).
        Default is 'arpack'.

    Returns
    -------
    AnnData or MuData
        Preprocessed data for the given profile. For 'RNA', 'ATAC' and 'ADT'
        this is the second element of the pair returned by ``preprocessing``.
        If the input already contains ``garfield_latent``, ``adata_list`` is
        returned as passed in. For an unrecognized ``profile`` the string
        ``"Unknown input data type."`` is returned.

    Raises
    ------
    ValueError
        If ``profile`` is 'multi-modal' and ``sub_data_type`` is not one of
        the supported modality pairs.
    """
    # Accept a single object as well as a list.
    candidates = adata_list if isinstance(adata_list, list) else [adata_list]

    # Already embedded: if any input has a non-empty garfield_latent in obsm,
    # hand the input straight back.
    if any(
        "garfield_latent" in item.obsm and item.obsm["garfield_latent"].size > 0
        for item in candidates
    ):
        return adata_list

    # Fail fast on a bad modality specification, before the costly load.
    if profile == "multi-modal":
        _check_multimodal_sub_data_type(sub_data_type)

    # load data
    adata = concat_data(
        adata_list,
        batch_categories=None,
        join="inner",
        batch_key=sample_col,  # 'batch'
        index_unique=None,
        save=None,
    )
    _store_counts_layer(adata)

    # Keyword arguments passed to preprocessing() for every profile.
    shared_kwargs = dict(
        profile=profile,
        data_type=data_type,
        genome=genome,
        use_gene_weight=use_gene_weight,
        use_top_pcs=use_top_pcs,
        used_hvgs=used_hvg,
        min_features=min_features,
        min_cells=min_cells,
        target_sum=target_sum,
        rna_n_top_features=rna_n_top_features,
        atac_n_top_features=atac_n_top_features,
        n_components=n_components,
        n=n_neighbors,
        batch_key=sample_col,
        metric=metric,
        svd_solver=svd_solver,
        keep_mt=keep_mt,
    )

    # RNA ATAC ADT
    if profile in ["RNA", "ATAC", "ADT"]:
        # preprocessing() returns a pair here; only the second item is used.
        _, adata_hvg = preprocessing(adata, **shared_kwargs)
        return adata_hvg

    # Paired multi-modal
    if profile == "multi-modal":
        first_mod, second_mod = sub_data_type
        # Rebuild a MuData holding copies of just the two requested modalities.
        mdata = MuData(
            {
                first_mod: adata.mod[first_mod].copy(),
                second_mod: adata.mod[second_mod].copy(),
            }
        )
        # Drop the local reference to the concatenated object now that the
        # modalities have been copied.
        del adata
        return preprocessing(
            mdata,
            sub_data_type=sub_data_type,
            weight=weight,
            user_cache_path=user_cache_path,
            **shared_kwargs,
        )

    # Spatial single- or multi-modal
    if profile == "spatial":
        return preprocessing(
            adata,
            sub_data_type=sub_data_type,
            weight=weight,
            graph_const_method=graph_const_method,
            user_cache_path=user_cache_path,
            **shared_kwargs,
        )

    # Kept as a string return (not an exception) for backward compatibility.
    return "Unknown input data type."