From 38d45ff63ff4cda9d90f0ce797f9efe46a03bd3b Mon Sep 17 00:00:00 2001
From: Dominik <dotto@fredhutch.org>
Date: Fri, 15 Sep 2023 15:06:30 -0700
Subject: [PATCH 1/4] Refactor run_diffusion_maps to Modularize Kernel
 Computation and Eigen Decomposition

This commit separates the functionality of run_diffusion_maps into two
distinct sub-functions: compute_kernel handles the adaptive anisotropic
kernel calculation, while diffusion_maps_from_kernel performs the eigen
decomposition for the diffusion maps.
---
 README.md             |   5 +-
 src/palantir/utils.py | 174 ++++++++++++++++++++++++++++--------------
 2 files changed, 118 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index eb00f5b4..13e1a870 100755
--- a/README.md
+++ b/README.md
@@ -107,12 +107,11 @@ ____
 
 Release Notes
 -------------
- ### Version 1.3.2
+ ### Version 1.3.1
  * implemented `palantir.plot.plot_stats` to plot arbitray cell-wise statistics as x-/y-positions.
  * reduce memory usgae of `palantir.presults.compute_gene_trends`
- 
- ### Version 1.3.1
  * removed seaborn dependency
+ * refactor `run_diffusion_maps` to split out `compute_kernel` and `diffusion_maps_from_kernel`
 
  ### Version 1.3.0
 
diff --git a/src/palantir/utils.py b/src/palantir/utils.py
index 685d8eda..e306c196 100644
--- a/src/palantir/utils.py
+++ b/src/palantir/utils.py
@@ -293,6 +293,113 @@ def run_density_evaluation(
     return log_density
 
 
+def compute_kernel(
+    data: Union[pd.DataFrame, sc.AnnData],
+    knn: int = 30,
+    alpha: float = 0,
+    pca_key: str = "X_pca",
+    kernel_key: str = "DM_Kernel",
+) -> csr_matrix:
+    """
+    Compute the adaptive anisotropic diffusion kernel.
+
+    Parameters
+    ----------
+    data : Union[pd.DataFrame, sc.AnnData]
+        Data points (rows) in a feature space (columns) for pd.DataFrame.
+        For sc.AnnData, it uses the .X attribute.
+    knn : int
+        Number of nearest neighbors for adaptive kernel calculation. Default is 30.
+    alpha : float
+        Normalization parameter for the diffusion operator. Default is 0.
+    pca_key : str, optional
+        Key to retrieve PCA projections from data if it is a sc.AnnData object. Default is 'X_pca'.
+    kernel_key : str, optional
+        Key to store the kernel in obsp of data if it is a sc.AnnData object. Default is 'DM_Kernel'.
+
+    Returns
+    -------
+    csr_matrix
+        Computed kernel matrix.
+    """
+
+    # If the input is sc.AnnData, convert it to a DataFrame
+    if isinstance(data, sc.AnnData):
+        data_df = pd.DataFrame(data.obsm[pca_key], index=data.obs_names)
+    else:
+        data_df = data
+
+    N = data_df.shape[0]
+    temp = sc.AnnData(data_df.values)
+    sc.pp.neighbors(temp, n_pcs=0, n_neighbors=knn)
+    kNN = temp.obsp["distances"]
+
+    adaptive_k = int(np.floor(knn / 3))
+    adaptive_std = np.zeros(N)
+    for i in np.arange(N):
+        adaptive_std[i] = np.sort(kNN.data[kNN.indptr[i] : kNN.indptr[i + 1]])[
+            adaptive_k - 1
+        ]
+
+    x, y, dists = find(kNN)
+    dists /= adaptive_std[x]
+    W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])
+
+    kernel = W + W.T
+
+    if alpha > 0:
+        D = np.ravel(kernel.sum(axis=1))
+        D[D != 0] = D[D != 0] ** (-alpha)
+        mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
+        kernel = mat.dot(kernel).dot(mat)
+
+    if isinstance(data, sc.AnnData):
+        data.obsp[kernel_key] = kernel
+
+    return kernel
+
+
+def diffusion_maps_from_kernel(
+    kernel: csr_matrix, n_components: int = 10, seed: Union[int, None] = 0
+):
+    """
+    Compute the diffusion map given a kernel matrix.
+
+    Parameters
+    ----------
+    kernel : csr_matrix
+        Precomputed kernel matrix.
+    n_components : int
+        Number of diffusion components to compute. Default is 10.
+    seed : Union[int, None]
+        Seed for random initialization. Default is 0.
+
+    Returns
+    -------
+    dict
+        T-matrix (T), Diffusion components (EigenVectors) and corresponding eigenvalues (EigenValues).
+    """
+    N = kernel.shape[0]
+    D = np.ravel(kernel.sum(axis=1))
+    D[D != 0] = 1 / D[D != 0]
+    T = csr_matrix((D, (range(N), range(N))), shape=[N, N]).dot(kernel)
+
+    np.random.seed(seed)
+    v0 = np.random.rand(min(T.shape))
+    D, V = eigs(T, n_components, tol=1e-4, maxiter=1000, v0=v0)
+
+    D = np.real(D)
+    V = np.real(V)
+    inds = np.argsort(D)[::-1]
+    D = D[inds]
+    V = V[:, inds]
+
+    for i in range(V.shape[1]):
+        V[:, i] = V[:, i] / np.linalg.norm(V[:, i])
+
+    return {"T": T, "EigenVectors": pd.DataFrame(V), "EigenValues": pd.Series(D)}
+
+
 def run_diffusion_maps(
     data: Union[pd.DataFrame, sc.AnnData],
     n_components: int = 10,
@@ -347,70 +454,21 @@ def run_diffusion_maps(
     else:
         data_df = data
 
-    if not isinstance(data_df, pd.DataFrame):
-        raise ValueError("'data_df' should be a pd.DataFrame or a sc.AnnData instance")
+    if not isinstance(data_df, pd.DataFrame) and not issparse(data_df):
+        raise ValueError("'data_df' should be a pd.DataFrame or sc.AnnData")
 
-    # Determine the kernel
-    N = data_df.shape[0]
     if not issparse(data_df):
-        print("Determing nearest neighbor graph...")
-        temp = sc.AnnData(data_df.values)
-        sc.pp.neighbors(temp, n_pcs=0, n_neighbors=knn)
-        kNN = temp.obsp["distances"]
-
-        # Adaptive k
-        adaptive_k = int(np.floor(knn / 3))
-        adaptive_std = np.zeros(N)
-
-        for i in np.arange(len(adaptive_std)):
-            adaptive_std[i] = np.sort(kNN.data[kNN.indptr[i] : kNN.indptr[i + 1]])[
-                adaptive_k - 1
-            ]
-
-        # Kernel
-        x, y, dists = find(kNN)
-
-        # X, y specific stds
-        dists = dists / adaptive_std[x]
-        W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])
-
-        # Diffusion components
-        kernel = W + W.T
+        kernel = compute_kernel(data_df, knn, alpha)
     else:
+        warn(
+            "'data' is a sparse matrix and will be interpreted as kernel. "
+            "To avoid this warning compute diffusion maps from a precompued kernel using "
+            "palantir.utils.diffusion_maps_from_kernel()."
+        )
         kernel = data_df
 
-    # Markov
-    D = np.ravel(kernel.sum(axis=1))
-
-    if alpha > 0:
-        # L_alpha
-        D[D != 0] = D[D != 0] ** (-alpha)
-        mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
-        kernel = mat.dot(kernel).dot(mat)
-        D = np.ravel(kernel.sum(axis=1))
-
-    D[D != 0] = 1 / D[D != 0]
-    T = csr_matrix((D, (range(N), range(N))), shape=[N, N]).dot(kernel)
-    # Eigen value dcomposition
-    np.random.seed(seed)
-    v0 = np.random.rand(min(T.shape))
-    D, V = eigs(T, n_components, tol=1e-4, maxiter=1000, v0=v0)
-    D = np.real(D)
-    V = np.real(V)
-    inds = np.argsort(D)[::-1]
-    D = D[inds]
-    V = V[:, inds]
-
-    # Normalize
-    for i in range(V.shape[1]):
-        V[:, i] = V[:, i] / np.linalg.norm(V[:, i])
+    res = diffusion_maps_from_kernel(kernel, n_components, seed)
 
-    # Create are results dictionary
-    res = {"T": T, "EigenVectors": V, "EigenValues": D}
-    res["EigenVectors"] = pd.DataFrame(res["EigenVectors"])
-    if not issparse(data_df):
-        res["EigenVectors"].index = data_df.index
-    res["EigenValues"] = pd.Series(res["EigenValues"])
     res["kernel"] = kernel
 
     if isinstance(data, sc.AnnData):

From ae7e68f901ba2c582dc56bdd1b5a63d9618fafb5 Mon Sep 17 00:00:00 2001
From: Dominik <dotto@fredhutch.org>
Date: Thu, 28 Sep 2023 16:33:54 -0700
Subject: [PATCH 2/4] correct version.py to v1.3.1

---
 src/palantir/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/palantir/version.py b/src/palantir/version.py
index 385c7d0d..a6239769 100644
--- a/src/palantir/version.py
+++ b/src/palantir/version.py
@@ -1,3 +1,3 @@
-__version__ = "1.3.2"
+__version__ = "1.3.1"
 __author__ = "Palantir development team"
 __author_email__ = "manu.talanki@gmail.com"

From 33e6abd3487fc888762d613e15f6467eda0f29b2 Mon Sep 17 00:00:00 2001
From: Dominik <dotto@fredhutch.org>
Date: Thu, 28 Sep 2023 16:34:14 -0700
Subject: [PATCH 3/4] remove dependency `tables`

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 987fcfa9..f1d2997e 100755
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,6 @@
         "joblib",
         "fcsparser>=0.1.2",
         "leidenalg>=0.9.1",
-        "tables>=3.4.2",
         "Cython",
         "cmake",
         "matplotlib>=2.2.2",

From bc06821a86db7aadb500feb18e9a5078220d486d Mon Sep 17 00:00:00 2001
From: Dominik <dotto@fredhutch.org>
Date: Thu, 28 Sep 2023 16:43:11 -0700
Subject: [PATCH 4/4] update changelog about drop of depenency `tables`

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 13e1a870..8d72ee8a 100755
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ Release Notes
  * reduce memory usgae of `palantir.presults.compute_gene_trends`
  * removed seaborn dependency
  * refactor `run_diffusion_maps` to split out `compute_kernel` and `diffusion_maps_from_kernel`
+ * remove unused dependency `tables`
 
  ### Version 1.3.0