Source code for Garfield.preprocessing._graph

"""
Functions for graph construction
"""
import numpy as np
import warnings
import scanpy as sc
import scipy.sparse as sp
import igraph as ig
import leidenalg
import pynndescent

from . import _utils as utils


def get_nearest_neighbors(
    query_arr,
    target_arr,
    svd_components=None,
    randomized_svd=False,
    svd_runs=1,
    metric="correlation",
):
    """
    Find, for every row of query_arr, its (approximate) closest row in target_arr.

    Both inputs are converted with ``utils.convert_to_numpy``, stacked
    vertically (query rows first), and handed to ``utils.svd_embedding`` so
    that the two sets share a single embedding. The embedded target rows are
    then indexed with ``pynndescent.NNDescent`` and the embedded query rows
    are looked up against that index.

    Parameters
    ----------
    query_arr: np.array of shape (n_samples1, n_features)
        The query data matrix.
    target_arr: np.array of shape (n_samples2, n_features)
        The target data matrix.
    svd_components: None or int, default=None
        Forwarded to ``utils.svd_embedding`` as ``n_components``.
        If not None, SVD is applied to the stacked matrix to reduce its
        dimension before the neighbor search.
    randomized_svd: bool, default=False
        Whether to use randomized SVD.
    svd_runs: int, default=1
        Run multiple instances of SVD and keep the one with the lowest
        Frobenius reconstruction error.
    metric: string, default='correlation'
        The metric used by the nearest neighbor index.

    Returns
    -------
    neighbors: np.array of shape (n_samples1)
        neighbors[i] is the index of the row of target_arr reported as
        closest to the i-th row of query_arr.
    dists: np.array of shape (n_samples1)
        dists[i] is the distance associated with neighbors[i].
    """
    query_mat = utils.convert_to_numpy(query_arr)
    target_mat = utils.convert_to_numpy(target_arr)
    n_query = query_mat.shape[0]

    # Embed query and target jointly so both live in the same reduced space.
    embedded = utils.svd_embedding(
        arr=np.vstack([query_mat, target_mat]),
        n_components=svd_components,
        randomized=randomized_svd,
        n_runs=svd_runs,
    )

    # Undo the stacking: the leading n_query rows are the query, the rest
    # are the target.
    embedded_query = embedded[:n_query, :]
    embedded_target = embedded[n_query:, :]

    # Approximate nearest neighbor search over the target rows.
    nn_index = pynndescent.NNDescent(
        embedded_target, n_neighbors=100, metric=metric
    )
    candidate_ids, candidate_dists = nn_index.query(embedded_query, k=50)

    # 50 candidates are requested per query row, but only the first column
    # (the top hit reported by the index) is returned to the caller.
    return candidate_ids[:, 0], candidate_dists[:, 0]