feat: initial commit - Phase 1 & 2 core features

This commit is contained in:
hiderfong
2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,21 @@
"""
The :mod:`sklearn.manifold` module implements data embedding techniques.
"""
from ._isomap import Isomap
from ._locally_linear import LocallyLinearEmbedding, locally_linear_embedding
from ._mds import MDS, smacof
from ._spectral_embedding import SpectralEmbedding, spectral_embedding
from ._t_sne import TSNE, trustworthiness
__all__ = [
"locally_linear_embedding",
"LocallyLinearEmbedding",
"Isomap",
"MDS",
"smacof",
"SpectralEmbedding",
"spectral_embedding",
"TSNE",
"trustworthiness",
]
@@ -0,0 +1,295 @@
# Author: Christopher Moody <chrisemoody@gmail.com>
# Author: Nick Travers <nickt@squareup.com>
# Implementation by Chris Moody & Nick Travers
# See http://homepage.tudelft.nl/19j49/t-SNE.html for reference
# implementations and papers describing the technique
import numpy as np
cimport numpy as cnp
from libc.stdio cimport printf
from libc.math cimport log
from libc.stdlib cimport malloc, free
from libc.time cimport clock, clock_t
from cython.parallel cimport prange, parallel
from ..neighbors._quad_tree cimport _QuadTree
cnp.import_array()
cdef char* EMPTY_STRING = ""
# Smallest strictly positive value that can be represented by floating
# point numbers for different precision levels. This is useful to avoid
# taking the log of zero when computing the KL divergence.
cdef float FLOAT32_TINY = np.finfo(np.float32).tiny
# Useful to void division by zero or divergence to +inf.
cdef float FLOAT64_EPS = np.finfo(np.float64).eps
# This is effectively an ifdef statement in Cython
# It allows us to write printf debugging lines
# and remove them at compile time
cdef enum:
DEBUGFLAG = 0
cdef float compute_gradient(float[:] val_P,
float[:, :] pos_reference,
cnp.int64_t[:] neighbors,
cnp.int64_t[:] indptr,
float[:, :] tot_force,
_QuadTree qt,
float theta,
int dof,
long start,
bint compute_error,
int num_threads) noexcept nogil:
# Having created the tree, calculate the gradient
# in two components, the positive and negative forces
cdef:
long i, coord
int ax
long n_samples = pos_reference.shape[0]
int n_dimensions = qt.n_dimensions
clock_t t1 = 0, t2 = 0
double sQ
float error
int take_timing = 1 if qt.verbose > 15 else 0
if qt.verbose > 11:
printf("[t-SNE] Allocating %li elements in force arrays\n",
n_samples * n_dimensions * 2)
cdef float* neg_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
cdef float* pos_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
if take_timing:
t1 = clock()
sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start,
num_threads)
if take_timing:
t2 = clock()
printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1)))
if take_timing:
t1 = clock()
error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr,
pos_f, n_dimensions, dof, sQ, start,
qt.verbose, compute_error, num_threads)
if take_timing:
t2 = clock()
printf("[t-SNE] Computing positive gradient: %e ticks\n",
((float) (t2 - t1)))
for i in prange(start, n_samples, nogil=True, num_threads=num_threads,
schedule='static'):
for ax in range(n_dimensions):
coord = i * n_dimensions + ax
tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ)
free(neg_f)
free(pos_f)
return error
cdef float compute_gradient_positive(float[:] val_P,
float[:, :] pos_reference,
cnp.int64_t[:] neighbors,
cnp.int64_t[:] indptr,
float* pos_f,
int n_dimensions,
int dof,
double sum_Q,
cnp.int64_t start,
int verbose,
bint compute_error,
int num_threads) noexcept nogil:
# Sum over the following expression for i not equal to j
# grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j)
# This is equivalent to compute_edge_forces in the authors' code
# It just goes over the nearest neighbors instead of all the data points
# (unlike the non-nearest neighbors version of `compute_gradient_positive')
cdef:
int ax
long i, j, k
long n_samples = indptr.shape[0] - 1
float C = 0.0
float dij, qij, pij
float exponent = (dof + 1.0) / 2.0
float float_dof = (float) (dof)
float* buff
clock_t t1 = 0, t2 = 0
float dt
if verbose > 10:
t1 = clock()
with nogil, parallel(num_threads=num_threads):
# Define private buffer variables
buff = <float *> malloc(sizeof(float) * n_dimensions)
for i in prange(start, n_samples, schedule='static'):
# Init the gradient vector
for ax in range(n_dimensions):
pos_f[i * n_dimensions + ax] = 0.0
# Compute the positive interaction for the nearest neighbors
for k in range(indptr[i], indptr[i+1]):
j = neighbors[k]
dij = 0.0
pij = val_P[k]
for ax in range(n_dimensions):
buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
dij += buff[ax] * buff[ax]
qij = float_dof / (float_dof + dij)
if dof != 1: # i.e. exponent != 1
qij = qij ** exponent
dij = pij * qij
# only compute the error when needed
if compute_error:
qij = qij / sum_Q
C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY))
for ax in range(n_dimensions):
pos_f[i * n_dimensions + ax] += dij * buff[ax]
free(buff)
if verbose > 10:
t2 = clock()
dt = ((float) (t2 - t1))
printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
return C
cdef double compute_gradient_negative(float[:, :] pos_reference,
float* neg_f,
_QuadTree qt,
int dof,
float theta,
long start,
int num_threads) noexcept nogil:
cdef:
int ax
int n_dimensions = qt.n_dimensions
int offset = n_dimensions + 2
long i, j, idx
long n_samples = pos_reference.shape[0]
long n = n_samples - start
long dta = 0
long dtb = 0
float size, dist2s, mult
float exponent = (dof + 1.0) / 2.0
float float_dof = (float) (dof)
double qijZ, sum_Q = 0.0
float* force
float* neg_force
float* pos
clock_t t1 = 0, t2 = 0, t3 = 0
int take_timing = 1 if qt.verbose > 20 else 0
with nogil, parallel(num_threads=num_threads):
# Define thread-local buffers
summary = <float*> malloc(sizeof(float) * n * offset)
pos = <float *> malloc(sizeof(float) * n_dimensions)
force = <float *> malloc(sizeof(float) * n_dimensions)
neg_force = <float *> malloc(sizeof(float) * n_dimensions)
for i in prange(start, n_samples, schedule='static'):
# Clear the arrays
for ax in range(n_dimensions):
force[ax] = 0.0
neg_force[ax] = 0.0
pos[ax] = pos_reference[i, ax]
# Find which nodes are summarizing and collect their centers of mass
# deltas, and sizes, into vectorized arrays
if take_timing:
t1 = clock()
idx = qt.summarize(pos, summary, theta*theta)
if take_timing:
t2 = clock()
# Compute the t-SNE negative force
# for the digits dataset, walking the tree
# is about 10-15x more expensive than the
# following for loop
for j in range(idx // offset):
dist2s = summary[j * offset + n_dimensions]
size = summary[j * offset + n_dimensions + 1]
qijZ = float_dof / (float_dof + dist2s) # 1/(1+dist)
if dof != 1: # i.e. exponent != 1
qijZ = qijZ ** exponent
sum_Q += size * qijZ # size of the node * q
mult = size * qijZ * qijZ
for ax in range(n_dimensions):
neg_force[ax] += mult * summary[j * offset + ax]
if take_timing:
t3 = clock()
for ax in range(n_dimensions):
neg_f[i * n_dimensions + ax] = neg_force[ax]
if take_timing:
dta += t2 - t1
dtb += t3 - t2
free(pos)
free(force)
free(neg_force)
free(summary)
if take_timing:
printf("[t-SNE] Tree: %li clock ticks | ", dta)
printf("Force computation: %li clock ticks\n", dtb)
# Put sum_Q to machine EPSILON to avoid divisions by 0
sum_Q = max(sum_Q, FLOAT64_EPS)
return sum_Q
def gradient(float[:] val_P,
float[:, :] pos_output,
cnp.int64_t[:] neighbors,
cnp.int64_t[:] indptr,
float[:, :] forces,
float theta,
int n_dimensions,
int verbose,
int dof=1,
long skip_num_points=0,
bint compute_error=1,
int num_threads=1):
# This function is designed to be called from external Python
# it passes the 'forces' array by reference and fills that's array
# up in-place
cdef float C
cdef int n
n = pos_output.shape[0]
assert val_P.itemsize == 4
assert pos_output.itemsize == 4
assert forces.itemsize == 4
m = "Forces array and pos_output shapes are incompatible"
assert n == forces.shape[0], m
m = "Pij and pos_output shapes are incompatible"
assert n == indptr.shape[0] - 1, m
if verbose > 10:
printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
cdef _QuadTree qt = _QuadTree(pos_output.shape[1], verbose)
if verbose > 10:
printf("[t-SNE] Inserting %li points\n", pos_output.shape[0])
qt.build_tree(pos_output)
if verbose > 10:
# XXX: format hack to workaround lack of `const char *` type
# in the generated C code that triggers error with gcc 4.9
# and -Werror=format-security
printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)
C = compute_gradient(val_P, pos_output, neighbors, indptr, forces,
qt, theta, dof, skip_num_points, compute_error,
num_threads)
if verbose > 10:
# XXX: format hack to workaround lack of `const char *` type
# in the generated C code
# and -Werror=format-security
printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
m = "Tree consistency failed: unexpected number of points on the tree"
assert qt.cells[0].cumulative_size == qt.n_points, m
if not compute_error:
C = np.nan
return C
@@ -0,0 +1,438 @@
"""Isomap for manifold learning"""
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) 2011
import warnings
from numbers import Integral, Real
import numpy as np
from scipy.sparse import issparse
from scipy.sparse.csgraph import connected_components, shortest_path
from ..base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_fit_context,
)
from ..decomposition import KernelPCA
from ..metrics.pairwise import _VALID_METRICS
from ..neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph
from ..preprocessing import KernelCenterer
from ..utils._param_validation import Interval, StrOptions
from ..utils.graph import _fix_connected_components
from ..utils.validation import check_is_fitted
class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
"""Isomap Embedding.
Non-linear dimensionality reduction through Isometric Mapping
Read more in the :ref:`User Guide <isomap>`.
Parameters
----------
n_neighbors : int or None, default=5
Number of neighbors to consider for each point. If `n_neighbors` is an int,
then `radius` must be `None`.
radius : float or None, default=None
Limiting distance of neighbors to return. If `radius` is a float,
then `n_neighbors` must be set to `None`.
.. versionadded:: 1.1
n_components : int, default=2
Number of coordinates for the manifold.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
'auto' : Attempt to choose the most efficient solver
for the given problem.
'arpack' : Use Arnoldi decomposition to find the eigenvalues
and eigenvectors.
'dense' : Use a direct solver (i.e. LAPACK)
for the eigenvalue decomposition.
tol : float, default=0
Convergence tolerance passed to arpack or lobpcg.
not used if eigen_solver == 'dense'.
max_iter : int, default=None
Maximum number of iterations for the arpack solver.
not used if eigen_solver == 'dense'.
path_method : {'auto', 'FW', 'D'}, default='auto'
Method to use in finding shortest path.
'auto' : attempt to choose the best algorithm automatically.
'FW' : Floyd-Warshall algorithm.
'D' : Dijkstra's algorithm.
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
default='auto'
Algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
metric : str, or callable, default="minkowski"
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a :term:`Glossary <sparse graph>`.
.. versionadded:: 0.22
p : float, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
.. versionadded:: 0.22
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.22
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the embedding vectors.
kernel_pca_ : object
:class:`~sklearn.decomposition.KernelPCA` object used to implement the
embedding.
nbrs_ : sklearn.neighbors.NearestNeighbors instance
Stores nearest neighbors instance, including BallTree or KDtree
if applicable.
dist_matrix_ : array-like, shape (n_samples, n_samples)
Stores the geodesic distance matrix of training data.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.decomposition.PCA : Principal component analysis that is a linear
dimensionality reduction method.
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
kernels and PCA.
MDS : Manifold learning using multidimensional scaling.
TSNE : T-distributed Stochastic Neighbor Embedding.
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
SpectralEmbedding : Spectral embedding for non-linear dimensionality.
References
----------
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
framework for nonlinear dimensionality reduction. Science 290 (5500)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import Isomap
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = Isomap(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
"radius": [Interval(Real, 0, None, closed="both"), None],
"n_components": [Interval(Integral, 1, None, closed="left")],
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left"), None],
"path_method": [StrOptions({"auto", "FW", "D"})],
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
"n_jobs": [Integral, None],
"p": [Interval(Real, 1, None, closed="left")],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
"metric_params": [dict, None],
}
def __init__(
self,
*,
n_neighbors=5,
radius=None,
n_components=2,
eigen_solver="auto",
tol=0,
max_iter=None,
path_method="auto",
neighbors_algorithm="auto",
n_jobs=None,
metric="minkowski",
p=2,
metric_params=None,
):
self.n_neighbors = n_neighbors
self.radius = radius
self.n_components = n_components
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.path_method = path_method
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
self.metric = metric
self.p = p
self.metric_params = metric_params
def _fit_transform(self, X):
if self.n_neighbors is not None and self.radius is not None:
raise ValueError(
"Both n_neighbors and radius are provided. Use"
f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
" radius-based neighbors"
)
self.nbrs_ = NearestNeighbors(
n_neighbors=self.n_neighbors,
radius=self.radius,
algorithm=self.neighbors_algorithm,
metric=self.metric,
p=self.p,
metric_params=self.metric_params,
n_jobs=self.n_jobs,
)
self.nbrs_.fit(X)
self.n_features_in_ = self.nbrs_.n_features_in_
if hasattr(self.nbrs_, "feature_names_in_"):
self.feature_names_in_ = self.nbrs_.feature_names_in_
self.kernel_pca_ = KernelPCA(
n_components=self.n_components,
kernel="precomputed",
eigen_solver=self.eigen_solver,
tol=self.tol,
max_iter=self.max_iter,
n_jobs=self.n_jobs,
).set_output(transform="default")
if self.n_neighbors is not None:
nbg = kneighbors_graph(
self.nbrs_,
self.n_neighbors,
metric=self.metric,
p=self.p,
metric_params=self.metric_params,
mode="distance",
n_jobs=self.n_jobs,
)
else:
nbg = radius_neighbors_graph(
self.nbrs_,
radius=self.radius,
metric=self.metric,
p=self.p,
metric_params=self.metric_params,
mode="distance",
n_jobs=self.n_jobs,
)
# Compute the number of connected components, and connect the different
# components to be able to compute a shortest path between all pairs
# of samples in the graph.
# Similar fix to cluster._agglomerative._fix_connectivity.
n_connected_components, labels = connected_components(nbg)
if n_connected_components > 1:
if self.metric == "precomputed" and issparse(X):
raise RuntimeError(
"The number of connected components of the neighbors graph"
f" is {n_connected_components} > 1. The graph cannot be "
"completed with metric='precomputed', and Isomap cannot be"
"fitted. Increase the number of neighbors to avoid this "
"issue, or precompute the full distance matrix instead "
"of passing a sparse neighbors graph."
)
warnings.warn(
(
"The number of connected components of the neighbors graph "
f"is {n_connected_components} > 1. Completing the graph to fit"
" Isomap might be slow. Increase the number of neighbors to "
"avoid this issue."
),
stacklevel=2,
)
# use array validated by NearestNeighbors
nbg = _fix_connected_components(
X=self.nbrs_._fit_X,
graph=nbg,
n_connected_components=n_connected_components,
component_labels=labels,
mode="distance",
metric=self.nbrs_.effective_metric_,
**self.nbrs_.effective_metric_params_,
)
self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)
if self.nbrs_._fit_X.dtype == np.float32:
self.dist_matrix_ = self.dist_matrix_.astype(
self.nbrs_._fit_X.dtype, copy=False
)
G = self.dist_matrix_**2
G *= -0.5
self.embedding_ = self.kernel_pca_.fit_transform(G)
self._n_features_out = self.embedding_.shape[1]
def reconstruction_error(self):
"""Compute the reconstruction error for the embedding.
Returns
-------
reconstruction_error : float
Reconstruction error.
Notes
-----
The cost function of an isomap embedding is
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
Where D is the matrix of distances for the input data X,
D_fit is the matrix of distances for the output embedding X_fit,
and K is the isomap kernel:
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
"""
G = -0.5 * self.dist_matrix_**2
G_center = KernelCenterer().fit_transform(G)
evals = self.kernel_pca_.eigenvalues_
return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]
@_fit_context(
# Isomap.metric is not validated yet
prefer_skip_nested_validation=False
)
def fit(self, X, y=None):
"""Compute the embedding vectors for data X.
Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse matrix, precomputed tree, or NearestNeighbors
object.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Returns a fitted instance of self.
"""
self._fit_transform(X)
return self
@_fit_context(
# Isomap.metric is not validated yet
prefer_skip_nested_validation=False
)
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree}
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
X_new : array-like, shape (n_samples, n_components)
X transformed in the new space.
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""Transform X.
This is implemented by linking the points X into the graph of geodesic
distances of the training data. First the `n_neighbors` nearest
neighbors of X are found in the training data, and from these the
shortest geodesic distances from each point in X to each point in
the training data are computed in order to construct the kernel.
The embedding of X is the projection of this kernel onto the
embedding vectors of the training set.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_queries, n_features)
If neighbors_algorithm='precomputed', X is assumed to be a
distance matrix or a sparse graph of shape
(n_queries, n_samples_fit).
Returns
-------
X_new : array-like, shape (n_queries, n_components)
X transformed in the new space.
"""
check_is_fitted(self)
if self.n_neighbors is not None:
distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
else:
distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)
# Create the graph of shortest distances from X to
# training data via the nearest neighbors of X.
# This can be done as a single array operation, but it potentially
# takes a lot of memory. To avoid that, use a loop:
n_samples_fit = self.nbrs_.n_samples_fit_
n_queries = distances.shape[0]
if hasattr(X, "dtype") and X.dtype == np.float32:
dtype = np.float32
else:
dtype = np.float64
G_X = np.zeros((n_queries, n_samples_fit), dtype)
for i in range(n_queries):
G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)
G_X **= 2
G_X *= -0.5
return self.kernel_pca_.transform(G_X)
def _more_tags(self):
return {"preserves_dtype": [np.float64, np.float32]}
@@ -0,0 +1,880 @@
"""Locally Linear Embedding"""
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) INRIA 2011
from numbers import Integral, Real
import numpy as np
from scipy.linalg import eigh, qr, solve, svd
from scipy.sparse import csr_matrix, eye
from scipy.sparse.linalg import eigsh
from ..base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_fit_context,
_UnstableArchMixin,
)
from ..neighbors import NearestNeighbors
from ..utils import check_array, check_random_state
from ..utils._arpack import _init_arpack_v0
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.extmath import stable_cumsum
from ..utils.validation import FLOAT_DTYPES, check_is_fitted
def barycenter_weights(X, Y, indices, reg=1e-3):
"""Compute barycenter weights of X from Y along the first axis
We estimate the weights to assign to each point in Y[indices] to recover
the point X[i]. The barycenter weights sum to 1.
Parameters
----------
X : array-like, shape (n_samples, n_dim)
Y : array-like, shape (n_samples, n_dim)
indices : array-like, shape (n_samples, n_dim)
Indices of the points in Y used to compute the barycenter
reg : float, default=1e-3
Amount of regularization to add for the problem to be
well-posed in the case of n_neighbors > n_dim
Returns
-------
B : array-like, shape (n_samples, n_neighbors)
Notes
-----
See developers note for more information.
"""
X = check_array(X, dtype=FLOAT_DTYPES)
Y = check_array(Y, dtype=FLOAT_DTYPES)
indices = check_array(indices, dtype=int)
n_samples, n_neighbors = indices.shape
assert X.shape[0] == n_samples
B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
v = np.ones(n_neighbors, dtype=X.dtype)
# this might raise a LinalgError if G is singular and has trace
# zero
for i, ind in enumerate(indices):
A = Y[ind]
C = A - X[i] # broadcasting
G = np.dot(C, C.T)
trace = np.trace(G)
if trace > 0:
R = reg * trace
else:
R = reg
G.flat[:: n_neighbors + 1] += R
w = solve(G, v, assume_a="pos")
B[i, :] = w / np.sum(w)
return B
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
"""Computes the barycenter weighted graph of k-Neighbors for points in X
Parameters
----------
X : {array-like, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array or a NearestNeighbors object.
n_neighbors : int
Number of neighbors for each sample.
reg : float, default=1e-3
Amount of regularization when solving the least-squares
problem. Only relevant if mode='barycenter'. If None, use the
default.
n_jobs : int or None, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
A : sparse matrix in CSR format, shape = [n_samples, n_samples]
A[i, j] is assigned the weight of edge that connects i to j.
See Also
--------
sklearn.neighbors.kneighbors_graph
sklearn.neighbors.radius_neighbors_graph
"""
knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X)
X = knn._fit_X
n_samples = knn.n_samples_fit_
ind = knn.kneighbors(X, return_distance=False)[:, 1:]
data = barycenter_weights(X, X, ind, reg=reg)
indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples))
def null_space(
M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None
):
"""
Find the null space of a matrix M.
Parameters
----------
M : {array, matrix, sparse matrix, LinearOperator}
Input covariance matrix: should be symmetric positive semi-definite
k : int
Number of eigenvalues/vectors to return
k_skip : int, default=1
Number of low eigenvalues to skip.
eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, default=1e-6
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for 'arpack' method.
Not used if eigen_solver=='dense'
random_state : int, RandomState instance, default=None
Determines the random number generator when ``solver`` == 'arpack'.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
"""
if eigen_solver == "auto":
if M.shape[0] > 200 and k + k_skip < 10:
eigen_solver = "arpack"
else:
eigen_solver = "dense"
if eigen_solver == "arpack":
v0 = _init_arpack_v0(M.shape[0], random_state)
try:
eigen_values, eigen_vectors = eigsh(
M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0
)
except RuntimeError as e:
raise ValueError(
"Error in determining null-space with ARPACK. Error message: "
"'%s'. Note that eigen_solver='arpack' can fail when the "
"weight matrix is singular or otherwise ill-behaved. In that "
"case, eigen_solver='dense' is recommended. See online "
"documentation for more information." % e
) from e
return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
elif eigen_solver == "dense":
if hasattr(M, "toarray"):
M = M.toarray()
eigen_values, eigen_vectors = eigh(
M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True
)
index = np.argsort(np.abs(eigen_values))
return eigen_vectors[:, index], np.sum(eigen_values)
else:
raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
def _locally_linear_embedding(
X,
*,
n_neighbors,
n_components,
reg=1e-3,
eigen_solver="auto",
tol=1e-6,
max_iter=100,
method="standard",
hessian_tol=1e-4,
modified_tol=1e-12,
random_state=None,
n_jobs=None,
):
nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
nbrs.fit(X)
X = nbrs._fit_X
N, d_in = X.shape
if n_components > d_in:
raise ValueError(
"output dimension must be less than or equal to input dimension"
)
if n_neighbors >= N:
raise ValueError(
"Expected n_neighbors <= n_samples, but n_samples = %d, n_neighbors = %d"
% (N, n_neighbors)
)
M_sparse = eigen_solver != "dense"
if method == "standard":
W = barycenter_kneighbors_graph(
nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs
)
# we'll compute M = (I-W)'(I-W)
# depending on the solver, we'll do this differently
if M_sparse:
M = eye(*W.shape, format=W.format) - W
M = (M.T * M).tocsr()
else:
M = (W.T * W - W.T - W).toarray()
M.flat[:: M.shape[0] + 1] += 1 # W = W - I = W - I
elif method == "hessian":
dp = n_components * (n_components + 1) // 2
if n_neighbors <= n_components + dp:
raise ValueError(
"for method='hessian', n_neighbors must be "
"greater than "
"[n_components * (n_components + 3) / 2]"
)
neighbors = nbrs.kneighbors(
X, n_neighbors=n_neighbors + 1, return_distance=False
)
neighbors = neighbors[:, 1:]
Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
Yi[:, 0] = 1
M = np.zeros((N, N), dtype=np.float64)
use_svd = n_neighbors > d_in
for i in range(N):
Gi = X[neighbors[i]]
Gi -= Gi.mean(0)
# build Hessian estimator
if use_svd:
U = svd(Gi, full_matrices=0)[0]
else:
Ci = np.dot(Gi, Gi.T)
U = eigh(Ci)[1][:, ::-1]
Yi[:, 1 : 1 + n_components] = U[:, :n_components]
j = 1 + n_components
for k in range(n_components):
Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components]
j += n_components - k
Q, R = qr(Yi)
w = Q[:, n_components + 1 :]
S = w.sum(0)
S[np.where(abs(S) < hessian_tol)] = 1
w /= S
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(w, w.T)
if M_sparse:
M = csr_matrix(M)
elif method == "modified":
if n_neighbors < n_components:
raise ValueError("modified LLE requires n_neighbors >= n_components")
neighbors = nbrs.kneighbors(
X, n_neighbors=n_neighbors + 1, return_distance=False
)
neighbors = neighbors[:, 1:]
# find the eigenvectors and eigenvalues of each local covariance
# matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
# where the columns are eigenvectors
V = np.zeros((N, n_neighbors, n_neighbors))
nev = min(d_in, n_neighbors)
evals = np.zeros([N, nev])
# choose the most efficient way to find the eigenvectors
use_svd = n_neighbors > d_in
if use_svd:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
V[i], evals[i], _ = svd(X_nbrs, full_matrices=True)
evals **= 2
else:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
C_nbrs = np.dot(X_nbrs, X_nbrs.T)
evi, vi = eigh(C_nbrs)
evals[i] = evi[::-1]
V[i] = vi[:, ::-1]
# find regularized weights: this is like normal LLE.
# because we've already computed the SVD of each covariance matrix,
# it's faster to use this rather than np.linalg.solve
reg = 1e-3 * evals.sum(1)
tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
tmp[:, :nev] /= evals + reg[:, None]
tmp[:, nev:] /= reg[:, None]
w_reg = np.zeros((N, n_neighbors))
for i in range(N):
w_reg[i] = np.dot(V[i], tmp[i])
w_reg /= w_reg.sum(1)[:, None]
# calculate eta: the median of the ratio of small to large eigenvalues
# across the points. This is used to determine s_i, below
rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
eta = np.median(rho)
# find s_i, the size of the "almost null space" for each point:
# this is the size of the largest set of eigenvalues
# such that Sum[v; v in set]/Sum[v; v not in set] < eta
s_range = np.zeros(N, dtype=int)
evals_cumsum = stable_cumsum(evals, 1)
eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
for i in range(N):
s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
s_range += n_neighbors - nev # number of zero eigenvalues
# Now calculate M.
# This is the [N x N] matrix whose null space is the desired embedding
M = np.zeros((N, N), dtype=np.float64)
for i in range(N):
s_i = s_range[i]
# select bottom s_i eigenvectors and calculate alpha
Vi = V[i, :, n_neighbors - s_i :]
alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
# compute Householder matrix which satisfies
# Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
# using prescription from paper
h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))
norm_h = np.linalg.norm(h)
if norm_h < modified_tol:
h *= 0
else:
h /= norm_h
# Householder matrix is
# >> Hi = np.identity(s_i) - 2*np.outer(h,h)
# Then the weight matrix is
# >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
# We do this much more efficiently:
Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]
# Update M as follows:
# >> W_hat = np.zeros( (N,s_i) )
# >> W_hat[neighbors[i],:] = Wi
# >> W_hat[i] -= 1
# >> M += np.dot(W_hat,W_hat.T)
# We can do this much more efficiently:
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
Wi_sum1 = Wi.sum(1)
M[i, neighbors[i]] -= Wi_sum1
M[neighbors[i], i] -= Wi_sum1
M[i, i] += s_i
if M_sparse:
M = csr_matrix(M)
elif method == "ltsa":
neighbors = nbrs.kneighbors(
X, n_neighbors=n_neighbors + 1, return_distance=False
)
neighbors = neighbors[:, 1:]
M = np.zeros((N, N))
use_svd = n_neighbors > d_in
for i in range(N):
Xi = X[neighbors[i]]
Xi -= Xi.mean(0)
# compute n_components largest eigenvalues of Xi * Xi^T
if use_svd:
v = svd(Xi, full_matrices=True)[0]
else:
Ci = np.dot(Xi, Xi.T)
v = eigh(Ci)[1][:, ::-1]
Gi = np.zeros((n_neighbors, n_components + 1))
Gi[:, 1:] = v[:, :n_components]
Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)
GiGiT = np.dot(Gi, Gi.T)
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] -= GiGiT
M[neighbors[i], neighbors[i]] += 1
return null_space(
M,
n_components,
k_skip=1,
eigen_solver=eigen_solver,
tol=tol,
max_iter=max_iter,
random_state=random_state,
)
@validate_params(
{
"X": ["array-like", NearestNeighbors],
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"n_components": [Interval(Integral, 1, None, closed="left")],
"reg": [Interval(Real, 0, None, closed="left")],
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
"hessian_tol": [Interval(Real, 0, None, closed="left")],
"modified_tol": [Interval(Real, 0, None, closed="left")],
"random_state": ["random_state"],
"n_jobs": [None, Integral],
},
prefer_skip_nested_validation=True,
)
def locally_linear_embedding(
X,
*,
n_neighbors,
n_components,
reg=1e-3,
eigen_solver="auto",
tol=1e-6,
max_iter=100,
method="standard",
hessian_tol=1e-4,
modified_tol=1e-12,
random_state=None,
n_jobs=None,
):
"""Perform a Locally Linear Embedding analysis on the data.
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
X : {array-like, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array or a NearestNeighbors object.
n_neighbors : int
Number of neighbors to consider for each point.
n_components : int
Number of coordinates for the manifold.
reg : float, default=1e-3
Regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, default=1e-6
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for the arpack solver.
method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
standard : use the standard locally linear embedding algorithm.
see reference [1]_
hessian : use the Hessian eigenmap method. This method requires
n_neighbors > n_components * (1 + (n_components + 1) / 2.
see reference [2]_
modified : use the modified locally linear embedding algorithm.
see reference [3]_
ltsa : use local tangent space alignment algorithm
see reference [4]_
hessian_tol : float, default=1e-4
Tolerance for Hessian eigenmapping method.
Only used if method == 'hessian'.
modified_tol : float, default=1e-12
Tolerance for modified LLE method.
Only used if method == 'modified'.
random_state : int, RandomState instance, default=None
Determines the random number generator when ``solver`` == 'arpack'.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
n_jobs : int or None, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
Y : ndarray of shape (n_samples, n_components)
Embedding vectors.
squared_error : float
Reconstruction error for the embedding vectors. Equivalent to
``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
<https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import locally_linear_embedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding, _ = locally_linear_embedding(X[:100],n_neighbors=5, n_components=2)
>>> embedding.shape
(100, 2)
"""
return _locally_linear_embedding(
X=X,
n_neighbors=n_neighbors,
n_components=n_components,
reg=reg,
eigen_solver=eigen_solver,
tol=tol,
max_iter=max_iter,
method=method,
hessian_tol=hessian_tol,
modified_tol=modified_tol,
random_state=random_state,
n_jobs=n_jobs,
)
class LocallyLinearEmbedding(
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
_UnstableArchMixin,
BaseEstimator,
):
"""Locally Linear Embedding.
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
n_neighbors : int, default=5
Number of neighbors to consider for each point.
n_components : int, default=2
Number of coordinates for the manifold.
reg : float, default=1e-3
Regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
The solver used to compute the eigenvectors. The available options are:
- `'auto'` : algorithm will attempt to choose the best method for input
data.
- `'arpack'` : use arnoldi iteration in shift-invert mode. For this
method, M may be a dense matrix, sparse matrix, or general linear
operator.
- `'dense'` : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array or matrix type.
This method should be avoided for large problems.
.. warning::
ARPACK can be unstable for some problems. It is best to try several
random seeds in order to check results.
tol : float, default=1e-6
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : int, default=100
Maximum number of iterations for the arpack solver.
Not used if eigen_solver=='dense'.
method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
- `standard`: use the standard locally linear embedding algorithm. see
reference [1]_
- `hessian`: use the Hessian eigenmap method. This method requires
``n_neighbors > n_components * (1 + (n_components + 1) / 2``. see
reference [2]_
- `modified`: use the modified locally linear embedding algorithm.
see reference [3]_
- `ltsa`: use local tangent space alignment algorithm. see
reference [4]_
hessian_tol : float, default=1e-4
Tolerance for Hessian eigenmapping method.
Only used if ``method == 'hessian'``.
modified_tol : float, default=1e-12
Tolerance for modified LLE method.
Only used if ``method == 'modified'``.
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
default='auto'
Algorithm to use for nearest neighbors search, passed to
:class:`~sklearn.neighbors.NearestNeighbors` instance.
random_state : int, RandomState instance, default=None
Determines the random number generator when
``eigen_solver`` == 'arpack'. Pass an int for reproducible results
across multiple function calls. See :term:`Glossary <random_state>`.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : array-like, shape [n_samples, n_components]
Stores the embedding vectors
reconstruction_error_ : float
Reconstruction error associated with `embedding_`
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
nbrs_ : NearestNeighbors object
Stores nearest neighbors instance, including BallTree or KDtree
if applicable.
See Also
--------
SpectralEmbedding : Spectral embedding for non-linear dimensionality
reduction.
TSNE : Distributed Stochastic Neighbor Embedding.
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
<https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import LocallyLinearEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = LocallyLinearEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"n_components": [Interval(Integral, 1, None, closed="left")],
"reg": [Interval(Real, 0, None, closed="left")],
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
"hessian_tol": [Interval(Real, 0, None, closed="left")],
"modified_tol": [Interval(Real, 0, None, closed="left")],
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
"random_state": ["random_state"],
"n_jobs": [None, Integral],
}
def __init__(
self,
*,
n_neighbors=5,
n_components=2,
reg=1e-3,
eigen_solver="auto",
tol=1e-6,
max_iter=100,
method="standard",
hessian_tol=1e-4,
modified_tol=1e-12,
neighbors_algorithm="auto",
random_state=None,
n_jobs=None,
):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.reg = reg
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.method = method
self.hessian_tol = hessian_tol
self.modified_tol = modified_tol
self.random_state = random_state
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
def _fit_transform(self, X):
self.nbrs_ = NearestNeighbors(
n_neighbors=self.n_neighbors,
algorithm=self.neighbors_algorithm,
n_jobs=self.n_jobs,
)
random_state = check_random_state(self.random_state)
X = self._validate_data(X, dtype=float)
self.nbrs_.fit(X)
self.embedding_, self.reconstruction_error_ = _locally_linear_embedding(
X=self.nbrs_,
n_neighbors=self.n_neighbors,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
tol=self.tol,
max_iter=self.max_iter,
method=self.method,
hessian_tol=self.hessian_tol,
modified_tol=self.modified_tol,
random_state=random_state,
reg=self.reg,
n_jobs=self.n_jobs,
)
self._n_features_out = self.embedding_.shape[1]
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Compute the embedding vectors for data X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
Fitted `LocallyLinearEmbedding` class instance.
"""
self._fit_transform(X)
return self
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None):
"""Compute the embedding vectors for data X and transform X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
X_new : array-like, shape (n_samples, n_components)
Returns the instance itself.
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""
Transform new points into embedding space.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training set.
Returns
-------
X_new : ndarray of shape (n_samples, n_components)
Returns the instance itself.
Notes
-----
Because of scaling performed by this method, it is discouraged to use
it together with methods that are not scale-invariant (like SVMs).
"""
check_is_fitted(self)
X = self._validate_data(X, reset=False)
ind = self.nbrs_.kneighbors(
X, n_neighbors=self.n_neighbors, return_distance=False
)
weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg)
X_new = np.empty((X.shape[0], self.n_components))
for i in range(X.shape[0]):
X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
return X_new
@@ -0,0 +1,653 @@
"""
Multi-dimensional Scaling (MDS).
"""
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
# License: BSD
import warnings
from numbers import Integral, Real
import numpy as np
from joblib import effective_n_jobs
from ..base import BaseEstimator, _fit_context
from ..isotonic import IsotonicRegression
from ..metrics import euclidean_distances
from ..utils import check_array, check_random_state, check_symmetric
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.parallel import Parallel, delayed
def _smacof_single(
dissimilarities,
metric=True,
n_components=2,
init=None,
max_iter=300,
verbose=0,
eps=1e-3,
random_state=None,
normalized_stress=False,
):
"""Computes multidimensional scaling using SMACOF algorithm.
Parameters
----------
dissimilarities : ndarray of shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Must be symmetric.
metric : bool, default=True
Compute metric or nonmetric SMACOF algorithm.
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
missing values.
n_components : int, default=2
Number of dimensions in which to immerse the dissimilarities. If an
``init`` array is provided, this option is overridden and the shape of
``init`` is used to determine the dimensionality of the embedding
space.
init : ndarray of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the algorithm. By
default, the algorithm is initialized with a randomly chosen array.
max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, default=0
Level of verbosity.
eps : float, default=1e-3
Relative tolerance with respect to stress at which to declare
convergence. The value of `eps` should be tuned separately depending
on whether or not `normalized_stress` is being used.
random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
normalized_stress : bool, default=False
Whether use and return normed stress value (Stress-1) instead of raw
stress calculated by default. Only supported in non-metric MDS. The
caller must ensure that if `normalized_stress=True` then `metric=False`
.. versionadded:: 1.2
Returns
-------
X : ndarray of shape (n_samples, n_components)
Coordinates of the points in a ``n_components``-space.
stress : float
The final value of the stress (sum of squared distance of the
disparities and the distances for all constrained points).
If `normalized_stress=True`, and `metric=False` returns Stress-1.
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
0.1 fair, and 0.2 poor [1]_.
n_iter : int
The number of iterations corresponding to the best stress.
References
----------
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
"""
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
n_samples = dissimilarities.shape[0]
random_state = check_random_state(random_state)
sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
sim_flat_w = sim_flat[sim_flat != 0]
if init is None:
# Randomly choose initial configuration
X = random_state.uniform(size=n_samples * n_components)
X = X.reshape((n_samples, n_components))
else:
# overrides the parameter p
n_components = init.shape[1]
if n_samples != init.shape[0]:
raise ValueError(
"init matrix should be of shape (%d, %d)" % (n_samples, n_components)
)
X = init
old_stress = None
ir = IsotonicRegression()
for it in range(max_iter):
# Compute distance and monotonic regression
dis = euclidean_distances(X)
if metric:
disparities = dissimilarities
else:
dis_flat = dis.ravel()
# dissimilarities with 0 are considered as missing values
dis_flat_w = dis_flat[sim_flat != 0]
# Compute the disparities using a monotonic regression
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
disparities = dis_flat.copy()
disparities[sim_flat != 0] = disparities_flat
disparities = disparities.reshape((n_samples, n_samples))
disparities *= np.sqrt(
(n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
)
# Compute stress
stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
if normalized_stress:
stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2))
# Update X using the Guttman transform
dis[dis == 0] = 1e-5
ratio = disparities / dis
B = -ratio
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
X = 1.0 / n_samples * np.dot(B, X)
dis = np.sqrt((X**2).sum(axis=1)).sum()
if verbose >= 2:
print("it: %d, stress %s" % (it, stress))
if old_stress is not None:
if (old_stress - stress / dis) < eps:
if verbose:
print("breaking at iteration %d with stress %s" % (it, stress))
break
old_stress = stress / dis
return X, stress, it + 1
@validate_params(
{
"dissimilarities": ["array-like"],
"metric": ["boolean"],
"n_components": [Interval(Integral, 1, None, closed="left")],
"init": ["array-like", None],
"n_init": [Interval(Integral, 1, None, closed="left")],
"n_jobs": [Integral, None],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"verbose": ["verbose"],
"eps": [Interval(Real, 0, None, closed="left")],
"random_state": ["random_state"],
"return_n_iter": ["boolean"],
"normalized_stress": ["boolean", StrOptions({"auto"})],
},
prefer_skip_nested_validation=True,
)
def smacof(
dissimilarities,
*,
metric=True,
n_components=2,
init=None,
n_init=8,
n_jobs=None,
max_iter=300,
verbose=0,
eps=1e-3,
random_state=None,
return_n_iter=False,
normalized_stress="auto",
):
"""Compute multidimensional scaling using the SMACOF algorithm.
The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
multidimensional scaling algorithm which minimizes an objective function
(the *stress*) using a majorization technique. Stress majorization, also
known as the Guttman Transform, guarantees a monotone convergence of
stress, and is more powerful than traditional techniques such as gradient
descent.
The SMACOF algorithm for metric MDS can be summarized by the following
steps:
1. Set an initial start configuration, randomly or not.
2. Compute the stress
3. Compute the Guttman Transform
4. Iterate 2 and 3 until convergence.
The nonmetric algorithm adds a monotonic regression step before computing
the stress.
Parameters
----------
dissimilarities : array-like of shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Must be symmetric.
metric : bool, default=True
Compute metric or nonmetric SMACOF algorithm.
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
missing values.
n_components : int, default=2
Number of dimensions in which to immerse the dissimilarities. If an
``init`` array is provided, this option is overridden and the shape of
``init`` is used to determine the dimensionality of the embedding
space.
init : array-like of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the algorithm. By
default, the algorithm is initialized with a randomly chosen array.
n_init : int, default=8
Number of times the SMACOF algorithm will be run with different
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress. If ``init`` is
provided, this option is overridden and a single run is performed.
n_jobs : int, default=None
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, default=0
Level of verbosity.
eps : float, default=1e-3
Relative tolerance with respect to stress at which to declare
convergence. The value of `eps` should be tuned separately depending
on whether or not `normalized_stress` is being used.
random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
return_n_iter : bool, default=False
Whether or not to return the number of iterations.
normalized_stress : bool or "auto" default="auto"
Whether use and return normed stress value (Stress-1) instead of raw
stress calculated by default. Only supported in non-metric MDS.
.. versionadded:: 1.2
.. versionchanged:: 1.4
The default value changed from `False` to `"auto"` in version 1.4.
Returns
-------
X : ndarray of shape (n_samples, n_components)
Coordinates of the points in a ``n_components``-space.
stress : float
The final value of the stress (sum of squared distance of the
disparities and the distances for all constrained points).
If `normalized_stress=True`, and `metric=False` returns Stress-1.
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
0.1 fair, and 0.2 poor [1]_.
n_iter : int
The number of iterations corresponding to the best stress. Returned
only if ``return_n_iter`` is set to ``True``.
References
----------
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
Examples
--------
>>> import numpy as np
>>> from sklearn.manifold import smacof
>>> from sklearn.metrics import euclidean_distances
>>> X = np.array([[0, 1, 2], [1, 0, 3],[2, 3, 0]])
>>> dissimilarities = euclidean_distances(X)
>>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42)
>>> mds_result
array([[ 0.05... -1.07... ],
[ 1.74..., -0.75...],
[-1.79..., 1.83...]])
>>> stress
0.0012...
"""
dissimilarities = check_array(dissimilarities)
random_state = check_random_state(random_state)
if normalized_stress == "auto":
normalized_stress = not metric
if normalized_stress and metric:
raise ValueError(
"Normalized stress is not supported for metric MDS. Either set"
" `normalized_stress=False` or use `metric=False`."
)
if hasattr(init, "__array__"):
init = np.asarray(init).copy()
if not n_init == 1:
warnings.warn(
"Explicit initial positions passed: "
"performing only one init of the MDS instead of %d" % n_init
)
n_init = 1
best_pos, best_stress = None, None
if effective_n_jobs(n_jobs) == 1:
for it in range(n_init):
pos, stress, n_iter_ = _smacof_single(
dissimilarities,
metric=metric,
n_components=n_components,
init=init,
max_iter=max_iter,
verbose=verbose,
eps=eps,
random_state=random_state,
normalized_stress=normalized_stress,
)
if best_stress is None or stress < best_stress:
best_stress = stress
best_pos = pos.copy()
best_iter = n_iter_
else:
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
delayed(_smacof_single)(
dissimilarities,
metric=metric,
n_components=n_components,
init=init,
max_iter=max_iter,
verbose=verbose,
eps=eps,
random_state=seed,
normalized_stress=normalized_stress,
)
for seed in seeds
)
positions, stress, n_iters = zip(*results)
best = np.argmin(stress)
best_stress = stress[best]
best_pos = positions[best]
best_iter = n_iters[best]
if return_n_iter:
return best_pos, best_stress, best_iter
else:
return best_pos, best_stress
class MDS(BaseEstimator):
"""Multidimensional scaling.
Read more in the :ref:`User Guide <multidimensional_scaling>`.
Parameters
----------
n_components : int, default=2
Number of dimensions in which to immerse the dissimilarities.
metric : bool, default=True
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
missing values.
n_init : int, default=4
Number of times the SMACOF algorithm will be run with different
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress.
max_iter : int, default=300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, default=0
Level of verbosity.
eps : float, default=1e-3
Relative tolerance with respect to stress at which to declare
convergence. The value of `eps` should be tuned separately depending
on whether or not `normalized_stress` is being used.
n_jobs : int, default=None
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
random_state : int, RandomState instance or None, default=None
Determines the random number generator used to initialize the centers.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
Dissimilarity measure to use:
- 'euclidean':
Pairwise Euclidean distances between points in the dataset.
- 'precomputed':
Pre-computed dissimilarities are passed directly to ``fit`` and
``fit_transform``.
normalized_stress : bool or "auto" default="auto"
Whether use and return normed stress value (Stress-1) instead of raw
stress calculated by default. Only supported in non-metric MDS.
.. versionadded:: 1.2
.. versionchanged:: 1.4
The default value changed from `False` to `"auto"` in version 1.4.
Attributes
----------
embedding_ : ndarray of shape (n_samples, n_components)
Stores the position of the dataset in the embedding space.
stress_ : float
The final value of the stress (sum of squared distance of the
disparities and the distances for all constrained points).
If `normalized_stress=True`, and `metric=False` returns Stress-1.
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
0.1 fair, and 0.2 poor [1]_.
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Symmetric matrix that:
- either uses a custom dissimilarity matrix by setting `dissimilarity`
to 'precomputed';
- or constructs a dissimilarity matrix from data using
Euclidean distances.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_iter_ : int
The number of iterations corresponding to the best stress.
See Also
--------
sklearn.decomposition.PCA : Principal component analysis that is a linear
dimensionality reduction method.
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
kernels and PCA.
TSNE : T-distributed Stochastic Neighbor Embedding.
Isomap : Manifold learning based on Isometric Mapping.
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
SpectralEmbedding : Spectral embedding for non-linear dimensionality.
References
----------
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import MDS
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = MDS(n_components=2, normalized_stress='auto')
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
For a more detailed example of usage, see:
:ref:`sphx_glr_auto_examples_manifold_plot_mds.py`
"""
_parameter_constraints: dict = {
"n_components": [Interval(Integral, 1, None, closed="left")],
"metric": ["boolean"],
"n_init": [Interval(Integral, 1, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"verbose": ["verbose"],
"eps": [Interval(Real, 0.0, None, closed="left")],
"n_jobs": [None, Integral],
"random_state": ["random_state"],
"dissimilarity": [StrOptions({"euclidean", "precomputed"})],
"normalized_stress": ["boolean", StrOptions({"auto"})],
}
def __init__(
self,
n_components=2,
*,
metric=True,
n_init=4,
max_iter=300,
verbose=0,
eps=1e-3,
n_jobs=None,
random_state=None,
dissimilarity="euclidean",
normalized_stress="auto",
):
self.n_components = n_components
self.dissimilarity = dissimilarity
self.metric = metric
self.n_init = n_init
self.max_iter = max_iter
self.eps = eps
self.verbose = verbose
self.n_jobs = n_jobs
self.random_state = random_state
self.normalized_stress = normalized_stress
def _more_tags(self):
return {"pairwise": self.dissimilarity == "precomputed"}
def fit(self, X, y=None, init=None):
"""
Compute the position of the points in the embedding space.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
Input data. If ``dissimilarity=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
Not used, present for API consistency by convention.
init : ndarray of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the SMACOF
algorithm. By default, the algorithm is initialized with a randomly
chosen array.
Returns
-------
self : object
Fitted estimator.
"""
self.fit_transform(X, init=init)
return self
@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, X, y=None, init=None):
"""
Fit the data from `X`, and returns the embedded coordinates.
Parameters
----------
X : array-like of shape (n_samples, n_features) or \
(n_samples, n_samples)
Input data. If ``dissimilarity=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
Not used, present for API consistency by convention.
init : ndarray of shape (n_samples, n_components), default=None
Starting configuration of the embedding to initialize the SMACOF
algorithm. By default, the algorithm is initialized with a randomly
chosen array.
Returns
-------
X_new : ndarray of shape (n_samples, n_components)
X transformed in the new space.
"""
X = self._validate_data(X)
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
warnings.warn(
"The MDS API has changed. ``fit`` now constructs an"
" dissimilarity matrix from data. To use a custom "
"dissimilarity matrix, set "
"``dissimilarity='precomputed'``."
)
if self.dissimilarity == "precomputed":
self.dissimilarity_matrix_ = X
elif self.dissimilarity == "euclidean":
self.dissimilarity_matrix_ = euclidean_distances(X)
self.embedding_, self.stress_, self.n_iter_ = smacof(
self.dissimilarity_matrix_,
metric=self.metric,
n_components=self.n_components,
init=init,
n_init=self.n_init,
n_jobs=self.n_jobs,
max_iter=self.max_iter,
verbose=self.verbose,
eps=self.eps,
random_state=self.random_state,
return_n_iter=True,
normalized_stress=self.normalized_stress,
)
return self.embedding_
@@ -0,0 +1,778 @@
"""Spectral Embedding."""
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Wei LI <kuantkid@gmail.com>
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.linalg import eigsh, lobpcg
from ..base import BaseEstimator, _fit_context
from ..metrics.pairwise import rbf_kernel
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..utils import (
check_array,
check_random_state,
check_symmetric,
)
from ..utils._arpack import _init_arpack_v0
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.extmath import _deterministic_vector_sign_flip
from ..utils.fixes import laplacian as csgraph_laplacian
from ..utils.fixes import parse_version, sp_version
def _graph_connected_component(graph, node_id):
"""Find the largest graph connected components that contains one
given node.
Parameters
----------
graph : array-like of shape (n_samples, n_samples)
Adjacency matrix of the graph, non-zero weight means an edge
between the nodes.
node_id : int
The index of the query node of the graph.
Returns
-------
connected_components_matrix : array-like of shape (n_samples,)
An array of bool value indicating the indexes of the nodes
belonging to the largest connected components of the given query
node.
"""
n_node = graph.shape[0]
if sparse.issparse(graph):
# speed up row-wise access to boolean connection mask
graph = graph.tocsr()
connected_nodes = np.zeros(n_node, dtype=bool)
nodes_to_explore = np.zeros(n_node, dtype=bool)
nodes_to_explore[node_id] = True
for _ in range(n_node):
last_num_component = connected_nodes.sum()
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
if last_num_component >= connected_nodes.sum():
break
indices = np.where(nodes_to_explore)[0]
nodes_to_explore.fill(False)
for i in indices:
if sparse.issparse(graph):
# scipy not yet implemented 1D sparse slices; can be changed back to
# `neighbors = graph[i].toarray().ravel()` once implemented
neighbors = graph[[i], :].toarray().ravel()
else:
neighbors = graph[i]
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
return connected_nodes
def _graph_is_connected(graph):
"""Return whether the graph is connected (True) or Not (False).
Parameters
----------
graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
Adjacency matrix of the graph, non-zero weight means an edge
between the nodes.
Returns
-------
is_connected : bool
True means the graph is fully connected and False means not.
"""
if sparse.issparse(graph):
# Before Scipy 1.11.3, `connected_components` only supports 32-bit indices.
# PR: https://github.com/scipy/scipy/pull/18913
# First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
# TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use
# `accept_large_sparse=True`.
accept_large_sparse = sp_version >= parse_version("1.11.3")
graph = check_array(
graph, accept_sparse=True, accept_large_sparse=accept_large_sparse
)
# sparse graph, find all the connected components
n_connected_components, _ = connected_components(graph)
return n_connected_components == 1
else:
# dense graph, find all connected components start from node 0
return _graph_connected_component(graph, 0).sum() == graph.shape[0]
def _set_diag(laplacian, value, norm_laplacian):
"""Set the diagonal of the laplacian matrix and convert it to a
sparse format well suited for eigenvalue decomposition.
Parameters
----------
laplacian : {ndarray, sparse matrix}
The graph laplacian.
value : float
The value of the diagonal.
norm_laplacian : bool
Whether the value of the diagonal should be changed or not.
Returns
-------
laplacian : {array, sparse matrix}
An array of matrix in a form that is well suited to fast
eigenvalue decomposition, depending on the band width of the
matrix.
"""
n_nodes = laplacian.shape[0]
# We need all entries in the diagonal to values
if not sparse.issparse(laplacian):
if norm_laplacian:
laplacian.flat[:: n_nodes + 1] = value
else:
laplacian = laplacian.tocoo()
if norm_laplacian:
diag_idx = laplacian.row == laplacian.col
laplacian.data[diag_idx] = value
# If the matrix has a small number of diagonals (as in the
# case of structured matrices coming from images), the
# dia format might be best suited for matvec products:
n_diags = np.unique(laplacian.row - laplacian.col).size
if n_diags <= 7:
# 3 or less outer diagonals on each side
laplacian = laplacian.todia()
else:
# csr has the fastest matvec and is thus best suited to
# arpack
laplacian = laplacian.tocsr()
return laplacian
@validate_params(
{
"adjacency": ["array-like", "sparse matrix"],
"n_components": [Interval(Integral, 1, None, closed="left")],
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
"random_state": ["random_state"],
"eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
"norm_laplacian": ["boolean"],
"drop_first": ["boolean"],
},
prefer_skip_nested_validation=True,
)
def spectral_embedding(
adjacency,
*,
n_components=8,
eigen_solver=None,
random_state=None,
eigen_tol="auto",
norm_laplacian=True,
drop_first=True,
):
"""Project the sample on the first eigenvectors of the graph Laplacian.
The adjacency matrix is used to compute a normalized graph Laplacian
whose spectrum (especially the eigenvectors associated to the
smallest eigenvalues) has an interpretation in terms of minimal
number of cuts necessary to split the graph into comparably sized
components.
This embedding can also 'work' even if the ``adjacency`` variable is
not strictly the adjacency matrix of a graph but more generally
an affinity or similarity matrix between samples (for instance the
heat kernel of a euclidean distance matrix or a k-NN matrix).
However care must taken to always make the affinity matrix symmetric
so that the eigenvector decomposition works as expected.
Note : Laplacian Eigenmaps is the actual algorithm implemented here.
Read more in the :ref:`User Guide <spectral_embedding>`.
Parameters
----------
adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
The adjacency matrix of the graph to embed.
n_components : int, default=8
The dimension of the projection subspace.
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities. If None, then ``'arpack'`` is
used.
random_state : int, RandomState instance or None, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigen vectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
eigen_tol : float, default="auto"
Stopping criterion for eigendecomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None` which configures the underlying `lobpcg` solver to
automatically resolve the value according to their heuristics. See,
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
to convergence issues and should be avoided.
.. versionadded:: 1.2
Added 'auto' option.
norm_laplacian : bool, default=True
If True, then compute symmetric normalized Laplacian.
drop_first : bool, default=True
Whether to drop the first eigenvector. For spectral embedding, this
should be True as the first eigenvector should be constant vector for
connected graph, but for spectral clustering, this should be kept as
False to retain the first eigenvector.
Returns
-------
embedding : ndarray of shape (n_samples, n_components)
The reduced samples.
Notes
-----
Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
has one connected component. If there graph has many components, the first
few eigenvectors will simply uncover the connected components of the graph.
References
----------
* https://en.wikipedia.org/wiki/LOBPCG
* :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
Block Preconditioned Conjugate Gradient Method",
Andrew V. Knyazev
<10.1137/S1064827500366124>`
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.neighbors import kneighbors_graph
>>> from sklearn.manifold import spectral_embedding
>>> X, _ = load_digits(return_X_y=True)
>>> X = X[:100]
>>> affinity_matrix = kneighbors_graph(
... X, n_neighbors=int(X.shape[0] / 10), include_self=True
... )
>>> # make the matrix symmetric
>>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)
>>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42)
>>> embedding.shape
(100, 2)
"""
random_state = check_random_state(random_state)
return _spectral_embedding(
adjacency,
n_components=n_components,
eigen_solver=eigen_solver,
random_state=random_state,
eigen_tol=eigen_tol,
norm_laplacian=norm_laplacian,
drop_first=drop_first,
)
def _spectral_embedding(
adjacency,
*,
n_components=8,
eigen_solver=None,
random_state=None,
eigen_tol="auto",
norm_laplacian=True,
drop_first=True,
):
adjacency = check_symmetric(adjacency)
if eigen_solver == "amg":
try:
from pyamg import smoothed_aggregation_solver
except ImportError as e:
raise ValueError(
"The eigen_solver was set to 'amg', but pyamg is not available."
) from e
if eigen_solver is None:
eigen_solver = "arpack"
n_nodes = adjacency.shape[0]
# Whether to drop the first eigenvector
if drop_first:
n_components = n_components + 1
if not _graph_is_connected(adjacency):
warnings.warn(
"Graph is not fully connected, spectral embedding may not work as expected."
)
laplacian, dd = csgraph_laplacian(
adjacency, normed=norm_laplacian, return_diag=True
)
if (
eigen_solver == "arpack"
or eigen_solver != "lobpcg"
and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components)
):
# lobpcg used with eigen_solver='amg' has bugs for low number of nodes
# for details see the source code in scipy:
# https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
# /lobpcg/lobpcg.py#L237
# or matlab:
# https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# Here we'll use shift-invert mode for fast eigenvalues
# (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
# for a short explanation of what this means)
# Because the normalized Laplacian has eigenvalues between 0 and 2,
# I - L has eigenvalues between -1 and 1. ARPACK is most efficient
# when finding eigenvalues of largest magnitude (keyword which='LM')
# and when these eigenvalues are very large compared to the rest.
# For very large, very sparse graphs, I - L can have many, many
# eigenvalues very near 1.0. This leads to slow convergence. So
# instead, we'll use ARPACK's shift-invert mode, asking for the
# eigenvalues near 1.0. This effectively spreads-out the spectrum
# near 1.0 and leads to much faster convergence: potentially an
# orders-of-magnitude speedup over simply using keyword which='LA'
# in standard mode.
try:
# We are computing the opposite of the laplacian inplace so as
# to spare a memory allocation of a possibly very large array
tol = 0 if eigen_tol == "auto" else eigen_tol
laplacian *= -1
v0 = _init_arpack_v0(laplacian.shape[0], random_state)
laplacian = check_array(
laplacian, accept_sparse="csr", accept_large_sparse=False
)
_, diffusion_map = eigsh(
laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
)
embedding = diffusion_map.T[n_components::-1]
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
except RuntimeError:
# When submatrices are exactly singular, an LU decomposition
# in arpack fails. We fallback to lobpcg
eigen_solver = "lobpcg"
# Revert the laplacian to its opposite to have lobpcg work
laplacian *= -1
elif eigen_solver == "amg":
# Use AMG to get a preconditioner and speed up the eigenvalue
# problem.
if not sparse.issparse(laplacian):
warnings.warn("AMG works better for sparse matrices")
laplacian = check_array(
laplacian, dtype=[np.float64, np.float32], accept_sparse=True
)
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# The Laplacian matrix is always singular, having at least one zero
# eigenvalue, corresponding to the trivial eigenvector, which is a
# constant. Using a singular matrix for preconditioning may result in
# random failures in LOBPCG and is not supported by the existing
# theory:
# see https://doi.org/10.1007/s10208-015-9297-1
# Shift the Laplacian so its diagononal is not all ones. The shift
# does change the eigenpairs however, so we'll feed the shifted
# matrix to the solver and afterward set it back to the original.
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
laplacian += diag_shift
if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array):
# `pyamg` does not work with `csr_array` and we need to convert it to a
# `csr_matrix` object.
laplacian = sparse.csr_matrix(laplacian)
ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
laplacian -= diag_shift
M = ml.aspreconditioner()
# Create initial approximation X to eigenvectors
X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
X[:, 0] = dd.ravel()
X = X.astype(laplacian.dtype)
tol = None if eigen_tol == "auto" else eigen_tol
_, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
embedding = diffusion_map.T
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
if embedding.shape[0] == 1:
raise ValueError
if eigen_solver == "lobpcg":
laplacian = check_array(
laplacian, dtype=[np.float64, np.float32], accept_sparse=True
)
if n_nodes < 5 * n_components + 1:
# see note above under arpack why lobpcg has problems with small
# number of nodes
# lobpcg will fallback to eigh, so we short circuit it
if sparse.issparse(laplacian):
laplacian = laplacian.toarray()
_, diffusion_map = eigh(laplacian, check_finite=False)
embedding = diffusion_map.T[:n_components]
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
else:
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# We increase the number of eigenvectors requested, as lobpcg
# doesn't behave well in low dimension and create initial
# approximation X to eigenvectors
X = random_state.standard_normal(
size=(laplacian.shape[0], n_components + 1)
)
X[:, 0] = dd.ravel()
X = X.astype(laplacian.dtype)
tol = None if eigen_tol == "auto" else eigen_tol
_, diffusion_map = lobpcg(
laplacian, X, tol=tol, largest=False, maxiter=2000
)
embedding = diffusion_map.T[:n_components]
if norm_laplacian:
# recover u = D^-1/2 x from the eigenvector output x
embedding = embedding / dd
if embedding.shape[0] == 1:
raise ValueError
embedding = _deterministic_vector_sign_flip(embedding)
if drop_first:
return embedding[1:n_components].T
else:
return embedding[:n_components].T
class SpectralEmbedding(BaseEstimator):
"""Spectral embedding for non-linear dimensionality reduction.
Forms an affinity matrix given by the specified function and
applies spectral decomposition to the corresponding graph laplacian.
The resulting transformation is given by the value of the
eigenvectors for each data point.
Note : Laplacian Eigenmaps is the actual algorithm implemented here.
Read more in the :ref:`User Guide <spectral_embedding>`.
Parameters
----------
n_components : int, default=2
The dimension of the projected subspace.
affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
'precomputed_nearest_neighbors'} or callable, \
default='nearest_neighbors'
How to construct the affinity matrix.
- 'nearest_neighbors' : construct the affinity matrix by computing a
graph of nearest neighbors.
- 'rbf' : construct the affinity matrix by computing a radial basis
function (RBF) kernel.
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
- 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
of precomputed nearest neighbors, and constructs the affinity matrix
by selecting the ``n_neighbors`` nearest neighbors.
- callable : use passed in function as affinity
the function takes in data matrix (n_samples, n_features)
and return affinity matrix (n_samples, n_samples).
gamma : float, default=None
Kernel coefficient for rbf kernel. If None, gamma will be set to
1/n_features.
random_state : int, RandomState instance or None, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigen vectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems.
If None, then ``'arpack'`` is used.
eigen_tol : float, default="auto"
Stopping criterion for eigendecomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None` which configures the underlying `lobpcg` solver to
automatically resolve the value according to their heuristics. See,
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
values of `tol<1e-5` may lead to convergence issues and should be
avoided.
.. versionadded:: 1.2
n_neighbors : int, default=None
Number of nearest neighbors for nearest_neighbors graph building.
If None, n_neighbors will be set to max(n_samples/10, 1).
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : ndarray of shape (n_samples, n_components)
Spectral embedding of the training matrix.
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
Affinity_matrix constructed from samples or precomputed.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_neighbors_ : int
Number of nearest neighbors effectively used.
See Also
--------
Isomap : Non-linear dimensionality reduction through Isometric Mapping.
References
----------
- :doi:`A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
<10.1007/s11222-007-9033-z>`
- `On Spectral Clustering: Analysis and an algorithm, 2001
Andrew Y. Ng, Michael I. Jordan, Yair Weiss
<https://citeseerx.ist.psu.edu/doc_view/pid/796c5d6336fc52aa84db575fb821c78918b65f58>`_
- :doi:`Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
<10.1109/34.868688>`
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import SpectralEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = SpectralEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
"""
_parameter_constraints: dict = {
"n_components": [Interval(Integral, 1, None, closed="left")],
"affinity": [
StrOptions(
{
"nearest_neighbors",
"rbf",
"precomputed",
"precomputed_nearest_neighbors",
},
),
callable,
],
"gamma": [Interval(Real, 0, None, closed="left"), None],
"random_state": ["random_state"],
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
"eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
"n_jobs": [None, Integral],
}
def __init__(
self,
n_components=2,
*,
affinity="nearest_neighbors",
gamma=None,
random_state=None,
eigen_solver=None,
eigen_tol="auto",
n_neighbors=None,
n_jobs=None,
):
self.n_components = n_components
self.affinity = affinity
self.gamma = gamma
self.random_state = random_state
self.eigen_solver = eigen_solver
self.eigen_tol = eigen_tol
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
def _more_tags(self):
return {
"pairwise": self.affinity
in [
"precomputed",
"precomputed_nearest_neighbors",
]
}
def _get_affinity_matrix(self, X, Y=None):
"""Calculate the affinity matrix from data
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
If affinity is "precomputed"
X : array-like of shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
Y: Ignored
Returns
-------
affinity_matrix of shape (n_samples, n_samples)
"""
if self.affinity == "precomputed":
self.affinity_matrix_ = X
return self.affinity_matrix_
if self.affinity == "precomputed_nearest_neighbors":
estimator = NearestNeighbors(
n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
).fit(X)
connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
return self.affinity_matrix_
if self.affinity == "nearest_neighbors":
if sparse.issparse(X):
warnings.warn(
"Nearest neighbors affinity currently does "
"not support sparse input, falling back to "
"rbf affinity"
)
self.affinity = "rbf"
else:
self.n_neighbors_ = (
self.n_neighbors
if self.n_neighbors is not None
else max(int(X.shape[0] / 10), 1)
)
self.affinity_matrix_ = kneighbors_graph(
X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
)
# currently only symmetric affinity_matrix supported
self.affinity_matrix_ = 0.5 * (
self.affinity_matrix_ + self.affinity_matrix_.T
)
return self.affinity_matrix_
if self.affinity == "rbf":
self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
return self.affinity_matrix_
self.affinity_matrix_ = self.affinity(X)
return self.affinity_matrix_
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Fit the model from data in X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
If affinity is "precomputed"
X : {array-like, sparse matrix}, shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Returns the instance itself.
"""
X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2)
random_state = check_random_state(self.random_state)
affinity_matrix = self._get_affinity_matrix(X)
self.embedding_ = _spectral_embedding(
affinity_matrix,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
eigen_tol=self.eigen_tol,
random_state=random_state,
)
return self
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
If affinity is "precomputed"
X : {array-like, sparse matrix} of shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
X_new : array-like of shape (n_samples, n_components)
Spectral embedding of the training matrix.
"""
self.fit(X)
return self.embedding_
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,120 @@
import numpy as np
from libc cimport math
from libc.math cimport INFINITY
from ..utils._typedefs cimport float32_t, float64_t
cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5
# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
def _binary_search_perplexity(
const float32_t[:, :] sqdistances,
float desired_perplexity,
int verbose):
"""Binary search for sigmas of conditional Gaussians.
This approximation reduces the computational complexity from O(N^2) to
O(uN).
Parameters
----------
sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
Distances between training samples and their k nearest neighbors.
When using the exact method, this is a square (n_samples, n_samples)
distance matrix. The TSNE default metric is "euclidean" which is
interpreted as squared euclidean distance.
desired_perplexity : float
Desired perplexity (2^entropy) of the conditional Gaussians.
verbose : int
Verbosity level.
Returns
-------
P : ndarray of shape (n_samples, n_samples), dtype=np.float64
Probabilities of conditional Gaussian distributions p_i|j.
"""
# Maximum number of binary search steps
cdef long n_steps = 100
cdef long n_samples = sqdistances.shape[0]
cdef long n_neighbors = sqdistances.shape[1]
cdef int using_neighbors = n_neighbors < n_samples
# Precisions of conditional Gaussian distributions
cdef double beta
cdef double beta_min
cdef double beta_max
cdef double beta_sum = 0.0
# Use log scale
cdef double desired_entropy = math.log(desired_perplexity)
cdef double entropy_diff
cdef double entropy
cdef double sum_Pi
cdef double sum_disti_Pi
cdef long i, j, l
# This array is later used as a 32bit array. It has multiple intermediate
# floating point additions that benefit from the extra precision
cdef float64_t[:, :] P = np.zeros(
(n_samples, n_neighbors), dtype=np.float64)
for i in range(n_samples):
beta_min = -INFINITY
beta_max = INFINITY
beta = 1.0
# Binary search of precision for i-th conditional distribution
for l in range(n_steps):
# Compute current entropy and corresponding probabilities
# computed just over the nearest neighbors or over all data
# if we're not using neighbors
sum_Pi = 0.0
for j in range(n_neighbors):
if j != i or using_neighbors:
P[i, j] = math.exp(-sqdistances[i, j] * beta)
sum_Pi += P[i, j]
if sum_Pi == 0.0:
sum_Pi = EPSILON_DBL
sum_disti_Pi = 0.0
for j in range(n_neighbors):
P[i, j] /= sum_Pi
sum_disti_Pi += sqdistances[i, j] * P[i, j]
entropy = math.log(sum_Pi) + beta * sum_disti_Pi
entropy_diff = entropy - desired_entropy
if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
break
if entropy_diff > 0.0:
beta_min = beta
if beta_max == INFINITY:
beta *= 2.0
else:
beta = (beta + beta_max) / 2.0
else:
beta_max = beta
if beta_min == -INFINITY:
beta /= 2.0
else:
beta = (beta + beta_min) / 2.0
beta_sum += beta
if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
print("[t-SNE] Computed conditional probabilities for sample "
"%d / %d" % (i + 1, n_samples))
if verbose:
print("[t-SNE] Mean sigma: %f"
% np.mean(math.sqrt(n_samples / beta_sum)))
return np.asarray(P)
@@ -0,0 +1,16 @@
py.extension_module(
'_utils',
['_utils.pyx', utils_cython_tree],
cython_args: cython_args,
subdir: 'sklearn/manifold',
install: true
)
py.extension_module(
'_barnes_hut_tsne',
'_barnes_hut_tsne.pyx',
dependencies: [np_dep],
cython_args: cython_args,
subdir: 'sklearn/manifold',
install: true
)
@@ -0,0 +1,348 @@
import math
from itertools import product
import numpy as np
import pytest
from scipy.sparse import rand as sparse_rand
from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils._testing import (
assert_allclose,
assert_allclose_dense_sparse,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
eigen_solvers = ["auto", "dense", "arpack"]
path_methods = ["auto", "FW", "D"]
def create_sample_data(dtype, n_pts=25, add_noise=False):
# grid of equidistant points in 2D, n_components = n_dim
n_per_side = int(math.sqrt(n_pts))
X = np.array(list(product(range(n_per_side), repeat=2))).astype(dtype, copy=False)
if add_noise:
# add noise in a third dimension
rng = np.random.RandomState(0)
noise = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False)
X = np.concatenate((X, noise), 1)
return X
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_simple_grid(
global_dtype, n_neighbors, radius, eigen_solver, path_method
):
# Isomap should preserve distances when all neighbors are used
n_pts = 25
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False)
# distances from each point to all others
if n_neighbors is not None:
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
else:
G = neighbors.radius_neighbors_graph(X, radius, mode="distance")
clf = manifold.Isomap(
n_neighbors=n_neighbors,
radius=radius,
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
)
clf.fit(X)
if n_neighbors is not None:
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
else:
G_iso = neighbors.radius_neighbors_graph(
clf.embedding_, radius, mode="distance"
)
atol = 1e-5 if global_dtype == np.float32 else 0
assert_allclose_dense_sparse(G, G_iso, atol=atol)
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_reconstruction_error(
global_dtype, n_neighbors, radius, eigen_solver, path_method
):
if global_dtype is np.float32:
pytest.skip(
"Skipping test due to numerical instabilities on float32 data"
"from KernelCenterer used in the reconstruction_error method"
)
# Same setup as in test_isomap_simple_grid, with an added dimension
n_pts = 25
X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True)
# compute input kernel
if n_neighbors is not None:
G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
else:
G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray()
centerer = preprocessing.KernelCenterer()
K = centerer.fit_transform(-0.5 * G**2)
clf = manifold.Isomap(
n_neighbors=n_neighbors,
radius=radius,
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
)
clf.fit(X)
# compute output kernel
if n_neighbors is not None:
G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
else:
G_iso = neighbors.radius_neighbors_graph(
clf.embedding_, radius, mode="distance"
)
G_iso = G_iso.toarray()
K_iso = centerer.fit_transform(-0.5 * G_iso**2)
# make sure error agrees
reconstruction_error = np.linalg.norm(K - K_iso) / n_pts
atol = 1e-5 if global_dtype == np.float32 else 0
assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol)
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
def test_transform(global_dtype, n_neighbors, radius):
n_samples = 200
n_components = 10
noise_scale = 0.01
# Create S-curve dataset
X, y = datasets.make_s_curve(n_samples, random_state=0)
X = X.astype(global_dtype, copy=False)
# Compute isomap embedding
iso = manifold.Isomap(
n_components=n_components, n_neighbors=n_neighbors, radius=radius
)
X_iso = iso.fit_transform(X)
# Re-embed a noisy version of the points
rng = np.random.RandomState(0)
noise = noise_scale * rng.randn(*X.shape)
X_iso2 = iso.transform(X + noise)
# Make sure the rms error on re-embedding is comparable to noise_scale
assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
def test_pipeline(n_neighbors, radius, global_dtype):
# check that Isomap works fine as a transformer in a Pipeline
# only checks that no error is raised.
# TODO check that it actually does something useful
X, y = datasets.make_blobs(random_state=0)
X = X.astype(global_dtype, copy=False)
clf = pipeline.Pipeline(
[
("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
("clf", neighbors.KNeighborsClassifier()),
]
)
clf.fit(X, y)
assert 0.9 < clf.score(X, y)
def test_pipeline_with_nearest_neighbors_transformer(global_dtype):
# Test chaining NearestNeighborsTransformer and Isomap with
# neighbors_algorithm='precomputed'
algorithm = "auto"
n_neighbors = 10
X, _ = datasets.make_blobs(random_state=0)
X2, _ = datasets.make_blobs(random_state=1)
X = X.astype(global_dtype, copy=False)
X2 = X2.astype(global_dtype, copy=False)
# compare the chained version and the compact version
est_chain = pipeline.make_pipeline(
neighbors.KNeighborsTransformer(
n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
),
manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"),
)
est_compact = manifold.Isomap(
n_neighbors=n_neighbors, neighbors_algorithm=algorithm
)
Xt_chain = est_chain.fit_transform(X)
Xt_compact = est_compact.fit_transform(X)
assert_allclose(Xt_chain, Xt_compact)
Xt_chain = est_chain.transform(X2)
Xt_compact = est_compact.transform(X2)
assert_allclose(Xt_chain, Xt_compact)
@pytest.mark.parametrize(
"metric, p, is_euclidean",
[
("euclidean", 2, True),
("manhattan", 1, False),
("minkowski", 1, False),
("minkowski", 2, True),
(lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False),
],
)
def test_different_metric(global_dtype, metric, p, is_euclidean):
# Isomap must work on various metric parameters work correctly
# and must default to euclidean.
X, _ = datasets.make_blobs(random_state=0)
X = X.astype(global_dtype, copy=False)
reference = manifold.Isomap().fit_transform(X)
embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)
if is_euclidean:
assert_allclose(embedding, reference)
else:
with pytest.raises(AssertionError, match="Not equal to tolerance"):
assert_allclose(embedding, reference)
def test_isomap_clone_bug():
# regression test for bug reported in #6062
model = manifold.Isomap()
for n_neighbors in [10, 15, 20]:
model.set_params(n_neighbors=n_neighbors)
model.fit(np.random.rand(50, 2))
assert model.nbrs_.n_neighbors == n_neighbors
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input(
global_dtype, eigen_solver, path_method, global_random_seed, csr_container
):
# TODO: compare results on dense and sparse data as proposed in:
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
X = csr_container(
sparse_rand(
100,
3,
density=0.1,
format="csr",
dtype=global_dtype,
random_state=global_random_seed,
)
)
iso_dense = manifold.Isomap(
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
n_neighbors=8,
)
iso_sparse = clone(iso_dense)
X_trans_dense = iso_dense.fit_transform(X.toarray())
X_trans_sparse = iso_sparse.fit_transform(X)
assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4)
def test_isomap_fit_precomputed_radius_graph(global_dtype):
# Isomap.fit_transform must yield similar result when using
# a precomputed distance matrix.
X, y = datasets.make_s_curve(200, random_state=0)
X = X.astype(global_dtype, copy=False)
radius = 10
g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed")
isomap.fit(g)
precomputed_result = isomap.embedding_
isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
result = isomap.fit_transform(X)
atol = 1e-5 if global_dtype == np.float32 else 0
assert_allclose(precomputed_result, result, atol=atol)
def test_isomap_fitted_attributes_dtype(global_dtype):
"""Check that the fitted attributes are stored accordingly to the
data type of X."""
iso = manifold.Isomap(n_neighbors=2)
X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
iso.fit(X)
assert iso.dist_matrix_.dtype == global_dtype
assert iso.embedding_.dtype == global_dtype
def test_isomap_dtype_equivalence():
"""Check the equivalence of the results with 32 and 64 bits input."""
iso_32 = manifold.Isomap(n_neighbors=2)
X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
iso_32.fit(X_32)
iso_64 = manifold.Isomap(n_neighbors=2)
X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
iso_64.fit(X_64)
assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_)
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
# Isomap.fit_transform must raise a ValueError if
# radius and n_neighbors are provided.
X, _ = datasets.load_digits(return_X_y=True)
isomap = manifold.Isomap(n_neighbors=3, radius=5.5)
msg = "Both n_neighbors and radius are provided"
with pytest.raises(ValueError, match=msg):
isomap.fit_transform(X)
def test_multiple_connected_components():
# Test that a warning is raised when the graph has multiple components
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
with pytest.warns(UserWarning, match="number of connected components"):
manifold.Isomap(n_neighbors=2).fit(X)
def test_multiple_connected_components_metric_precomputed(global_dtype):
# Test that an error is raised when the graph has multiple components
# and when X is a precomputed neighbors graph.
X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False)
# works with a precomputed distance matrix (dense)
X_distances = pairwise_distances(X)
with pytest.warns(UserWarning, match="number of connected components"):
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)
# does not work with a precomputed neighbors graph (sparse)
X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
with pytest.raises(RuntimeError, match="number of connected components"):
manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
def test_get_feature_names_out():
"""Check get_feature_names_out for Isomap."""
X, y = make_blobs(random_state=0, n_features=4)
n_components = 2
iso = manifold.Isomap(n_components=n_components)
iso.fit_transform(X)
names = iso.get_feature_names_out()
assert_array_equal([f"isomap{i}" for i in range(n_components)], names)
@@ -0,0 +1,171 @@
from itertools import product
import numpy as np
import pytest
from scipy import linalg
from sklearn import manifold, neighbors
from sklearn.datasets import make_blobs
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
from sklearn.utils._testing import (
assert_allclose,
assert_array_equal,
ignore_warnings,
)
eigen_solvers = ["dense", "arpack"]
# ----------------------------------------------------------------------
# Test utility routines
def test_barycenter_kneighbors_graph(global_dtype):
X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)
graph = barycenter_kneighbors_graph(X, 1)
expected_graph = np.array(
[[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
)
assert graph.dtype == global_dtype
assert_allclose(graph.toarray(), expected_graph)
graph = barycenter_kneighbors_graph(X, 2)
# check that columns sum to one
assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3))
pred = np.dot(graph.toarray(), X)
assert linalg.norm(pred - X) / X.shape[0] < 1
# ----------------------------------------------------------------------
# Test LLE by computing the reconstruction error on some manifolds.
def test_lle_simple_grid(global_dtype):
# note: ARPACK is numerically unstable, so this test will fail for
# some random seeds. We choose 42 because the tests pass.
# for arm64 platforms 2 makes the test fail.
# TODO: rewrite this test to make less sensitive to the random seed,
# irrespective of the platform.
rng = np.random.RandomState(42)
# grid of equidistant points in 2D, n_components = n_dim
X = np.array(list(product(range(5), repeat=2)))
X = X + 1e-10 * rng.uniform(size=X.shape)
X = X.astype(global_dtype, copy=False)
n_components = 2
clf = manifold.LocallyLinearEmbedding(
n_neighbors=5, n_components=n_components, random_state=rng
)
tol = 0.1
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro")
assert reconstruction_error < tol
for solver in eigen_solvers:
clf.set_params(eigen_solver=solver)
clf.fit(X)
assert clf.embedding_.shape[1] == n_components
reconstruction_error = (
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
)
assert reconstruction_error < tol
assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)
# re-embed a noisy version of X using the transform method
noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
X_reembedded = clf.transform(X + noise)
assert linalg.norm(X_reembedded - clf.embedding_) < tol
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
@pytest.mark.parametrize("solver", eigen_solvers)
def test_lle_manifold(global_dtype, method, solver):
rng = np.random.RandomState(0)
# similar test on a slightly more complex manifold
X = np.array(list(product(np.arange(18), repeat=2)))
X = np.c_[X, X[:, 0] ** 2 / 18]
X = X + 1e-10 * rng.uniform(size=X.shape)
X = X.astype(global_dtype, copy=False)
n_components = 2
clf = manifold.LocallyLinearEmbedding(
n_neighbors=6, n_components=n_components, method=method, random_state=0
)
tol = 1.5 if method == "standard" else 3
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
reconstruction_error = linalg.norm(np.dot(N, X) - X)
assert reconstruction_error < tol
clf.set_params(eigen_solver=solver)
clf.fit(X)
assert clf.embedding_.shape[1] == n_components
reconstruction_error = (
linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
)
details = "solver: %s, method: %s" % (solver, method)
assert reconstruction_error < tol, details
assert (
np.abs(clf.reconstruction_error_ - reconstruction_error)
< tol * reconstruction_error
), details
def test_pipeline():
# check that LocallyLinearEmbedding works fine as a Pipeline
# only checks that no error is raised.
# TODO check that it actually does something useful
from sklearn import datasets, pipeline
X, y = datasets.make_blobs(random_state=0)
clf = pipeline.Pipeline(
[
("filter", manifold.LocallyLinearEmbedding(random_state=0)),
("clf", neighbors.KNeighborsClassifier()),
]
)
clf.fit(X, y)
assert 0.9 < clf.score(X, y)
# Test the error raised when the weight matrix is singular
def test_singular_matrix():
M = np.ones((200, 3))
f = ignore_warnings
with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"):
f(
manifold.locally_linear_embedding(
M,
n_neighbors=2,
n_components=1,
method="standard",
eigen_solver="arpack",
)
)
# regression test for #6033
def test_integer_input():
rand = np.random.RandomState(0)
X = rand.randint(0, 100, size=(20, 3))
for method in ["standard", "hessian", "modified", "ltsa"]:
clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
clf.fit(X) # this previously raised a TypeError
def test_get_feature_names_out():
"""Check get_feature_names_out for LocallyLinearEmbedding."""
X, y = make_blobs(random_state=0, n_features=4)
n_components = 2
iso = manifold.LocallyLinearEmbedding(n_components=n_components)
iso.fit(X)
names = iso.get_feature_names_out()
assert_array_equal(
[f"locallylinearembedding{i}" for i in range(n_components)], names
)
@@ -0,0 +1,87 @@
from unittest.mock import Mock
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_almost_equal
from sklearn.manifold import _mds as mds
from sklearn.metrics import euclidean_distances
def test_smacof():
# test metric smacof using the data of "Modern Multidimensional Scaling",
# Borg & Groenen, p 154
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]])
X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
X_true = np.array(
[[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
)
assert_array_almost_equal(X, X_true, decimal=3)
def test_smacof_error():
# Not symmetric similarity matrix:
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
with pytest.raises(ValueError):
mds.smacof(sim)
# Not squared similarity matrix:
sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
with pytest.raises(ValueError):
mds.smacof(sim)
# init not None and not correct format:
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
with pytest.raises(ValueError):
mds.smacof(sim, init=Z, n_init=1)
def test_MDS():
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
mds_clf.fit(sim)
@pytest.mark.parametrize("k", [0.5, 1.5, 2])
def test_normed_stress(k):
"""Test that non-metric MDS normalized stress is scale-invariant."""
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0)
X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0)
assert_allclose(stress1, stress2, rtol=1e-5)
assert_allclose(X1, X2, rtol=1e-5)
def test_normalize_metric_warning():
"""
Test that a UserWarning is emitted when using normalized stress with
metric-MDS.
"""
msg = "Normalized stress is not supported"
sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
with pytest.raises(ValueError, match=msg):
mds.smacof(sim, metric=True, normalized_stress=True)
@pytest.mark.parametrize("metric", [True, False])
def test_normalized_stress_auto(metric, monkeypatch):
rng = np.random.RandomState(0)
X = rng.randn(4, 3)
dist = euclidean_distances(X)
mock = Mock(side_effect=mds._smacof_single)
monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock)
est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng)
est.fit_transform(X)
assert mock.call_args[1]["normalized_stress"] != metric
mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng)
assert mock.call_args[1]["normalized_stress"] != metric
@@ -0,0 +1,541 @@
from unittest.mock import Mock
import numpy as np
import pytest
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh, lobpcg
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding
from sklearn.manifold._spectral_embedding import (
_graph_connected_component,
_graph_is_connected,
)
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
parse_version,
sp_version,
)
from sklearn.utils.fixes import laplacian as csgraph_laplacian
try:
from pyamg import smoothed_aggregation_solver # noqa
pyamg_available = True
except ImportError:
pyamg_available = False
skip_if_no_pyamg = pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# non centered, sparse centers to check the
centers = np.array(
[
[0.0, 5.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 4.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 5.0, 1.0],
]
)
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(
n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
"""Check array A and B are equal with possible sign flipping on
each columns"""
tol_squared = tol**2
for A_col, B_col in zip(A.T, B.T):
assert (
np.max((A_col - B_col) ** 2) <= tol_squared
or np.max((A_col + B_col) ** 2) <= tol_squared
)
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_sparse_graph_connected_component(coo_container):
rng = np.random.RandomState(42)
n_samples = 300
boundaries = [0, 42, 121, 200, n_samples]
p = rng.permutation(n_samples)
connections = []
for start, stop in zip(boundaries[:-1], boundaries[1:]):
group = p[start:stop]
# Connect all elements within the group at least once via an
# arbitrary path that spans the group.
for i in range(len(group) - 1):
connections.append((group[i], group[i + 1]))
# Add some more random connections within the group
min_idx, max_idx = 0, len(group) - 1
n_random_connections = 1000
source = rng.randint(min_idx, max_idx, size=n_random_connections)
target = rng.randint(min_idx, max_idx, size=n_random_connections)
connections.extend(zip(group[source], group[target]))
# Build a symmetric affinity matrix
row_idx, column_idx = tuple(np.array(connections).T)
data = rng.uniform(0.1, 42, size=len(connections))
affinity = coo_container((data, (row_idx, column_idx)))
affinity = 0.5 * (affinity + affinity.T)
for start, stop in zip(boundaries[:-1], boundaries[1:]):
component_1 = _graph_connected_component(affinity, p[start])
component_size = stop - start
assert component_1.sum() == component_size
# We should retrieve the same component mask by starting by both ends
# of the group
component_2 = _graph_connected_component(affinity, p[stop - 1])
assert component_2.sum() == component_size
assert_array_equal(component_1, component_2)
# TODO: investigate why this test is seed-sensitive on 32-bit Python
# runtimes. Is this revealing a numerical stability problem ? Or is it
# expected from the test numerical design ? In the latter case the test
# should be made less seed-sensitive instead.
@pytest.mark.parametrize(
"eigen_solver",
[
"arpack",
"lobpcg",
pytest.param("amg", marks=skip_if_no_pyamg),
],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
# Test spectral embedding with two components
random_state = np.random.RandomState(seed)
n_sample = 100
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
# first component
affinity[0:n_sample, 0:n_sample] = (
np.abs(random_state.randn(n_sample, n_sample)) + 2
)
# second component
affinity[n_sample::, n_sample::] = (
np.abs(random_state.randn(n_sample, n_sample)) + 2
)
# Test of internal _graph_connected_component before connection
component = _graph_connected_component(affinity, 0)
assert component[:n_sample].all()
assert not component[n_sample:].any()
component = _graph_connected_component(affinity, -1)
assert not component[:n_sample].any()
assert component[n_sample:].all()
# connection
affinity[0, n_sample + 1] = 1
affinity[n_sample + 1, 0] = 1
affinity.flat[:: 2 * n_sample + 1] = 0
affinity = 0.5 * (affinity + affinity.T)
true_label = np.zeros(shape=2 * n_sample)
true_label[0:n_sample] = 1
se_precomp = SpectralEmbedding(
n_components=1,
affinity="precomputed",
random_state=np.random.RandomState(seed),
eigen_solver=eigen_solver,
)
embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
# thresholding on the first components using 0.
label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize(
"eigen_solver",
[
"arpack",
"lobpcg",
pytest.param("amg", marks=skip_if_no_pyamg),
],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(
sparse_container, eigen_solver, dtype, seed=36
):
# Test spectral embedding with precomputed kernel
gamma = 1.0
X = S if sparse_container is None else sparse_container(S)
se_precomp = SpectralEmbedding(
n_components=2,
affinity="precomputed",
random_state=np.random.RandomState(seed),
eigen_solver=eigen_solver,
)
se_rbf = SpectralEmbedding(
n_components=2,
affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed),
eigen_solver=eigen_solver,
)
embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma))
embed_rbf = se_rbf.fit_transform(X.astype(dtype))
assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
_assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
def test_precomputed_nearest_neighbors_filtering():
# Test precomputed graph filtering when containing too many neighbors
n_neighbors = 2
results = []
for additional_neighbors in [0, 10]:
nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S)
graph = nn.kneighbors_graph(S, mode="connectivity")
embedding = (
SpectralEmbedding(
random_state=0,
n_components=2,
affinity="precomputed_nearest_neighbors",
n_neighbors=n_neighbors,
)
.fit(graph)
.embedding_
)
results.append(embedding)
assert_array_equal(results[0], results[1])
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
def test_spectral_embedding_callable_affinity(sparse_container, seed=36):
# Test spectral embedding with callable affinity
gamma = 0.9
kern = rbf_kernel(S, gamma=gamma)
X = S if sparse_container is None else sparse_container(S)
se_callable = SpectralEmbedding(
n_components=2,
affinity=(lambda x: rbf_kernel(x, gamma=gamma)),
gamma=gamma,
random_state=np.random.RandomState(seed),
)
se_rbf = SpectralEmbedding(
n_components=2,
affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed),
)
embed_rbf = se_rbf.fit_transform(X)
embed_callable = se_callable.fit_transform(X)
assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
_assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
"ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
"ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.filterwarnings(
"ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36):
se_amg = SpectralEmbedding(
n_components=2,
affinity="nearest_neighbors",
eigen_solver="amg",
n_neighbors=5,
random_state=np.random.RandomState(seed),
)
se_arpack = SpectralEmbedding(
n_components=2,
affinity="nearest_neighbors",
eigen_solver="arpack",
n_neighbors=5,
random_state=np.random.RandomState(seed),
)
embed_amg = se_amg.fit_transform(S.astype(dtype))
embed_arpack = se_arpack.fit_transform(S.astype(dtype))
_assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
# same with special case in which amg is not actually used
# regression test for #10715
# affinity between nodes
row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32)
col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32)
val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64)
affinity = coo_container(
(np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))),
shape=(6, 6),
)
se_amg.affinity = "precomputed"
se_arpack.affinity = "precomputed"
embed_amg = se_amg.fit_transform(affinity.astype(dtype))
embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
_assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
# Check that passing a sparse matrix with `np.int64` indices dtype raises an error
# or is successful based on the version of SciPy which is installed.
# Use a CSR matrix to avoid any conversion during the validation
affinity = affinity.tocsr()
affinity.indptr = affinity.indptr.astype(np.int64)
affinity.indices = affinity.indices.astype(np.int64)
# PR: https://github.com/scipy/scipy/pull/18913
# First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3")
if scipy_graph_traversal_supports_int64_index:
se_amg.fit_transform(affinity)
else:
err_msg = "Only sparse matrices with 32-bit integer indices are accepted"
with pytest.raises(ValueError, match=err_msg):
se_amg.fit_transform(affinity)
# TODO: Remove filterwarnings when pyamg does replaces sp.rand call with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
"ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
"ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
"ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
# Non-regression test for amg solver failure (issue #13393 on github)
num_nodes = 100
X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
X = X.astype(dtype)
upper = sparse.triu(X) - sparse.diags(X.diagonal())
sym_matrix = upper + upper.T
embedding = spectral_embedding(
sym_matrix, n_components=10, eigen_solver="amg", random_state=0
)
# Check that the learned embedding is stable w.r.t. random solver init:
for i in range(3):
new_embedding = spectral_embedding(
sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
)
_assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)
@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
# Test using pipeline to do spectral clustering
random_state = np.random.RandomState(seed)
se_rbf = SpectralEmbedding(
n_components=n_clusters, affinity="rbf", random_state=random_state
)
se_knn = SpectralEmbedding(
n_components=n_clusters,
affinity="nearest_neighbors",
n_neighbors=5,
random_state=random_state,
)
for se in [se_rbf, se_knn]:
km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
km.fit(se.fit_transform(S))
assert_array_almost_equal(
normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2
)
def test_connectivity(seed=36):
# Test that graph connectivity test works as expected
graph = np.array(
[
[1, 0, 0, 0, 0],
[0, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1],
]
)
assert not _graph_is_connected(graph)
for csr_container in CSR_CONTAINERS:
assert not _graph_is_connected(csr_container(graph))
for csc_container in CSC_CONTAINERS:
assert not _graph_is_connected(csc_container(graph))
graph = np.array(
[
[1, 1, 0, 0, 0],
[1, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1],
]
)
assert _graph_is_connected(graph)
for csr_container in CSR_CONTAINERS:
assert _graph_is_connected(csr_container(graph))
for csc_container in CSC_CONTAINERS:
assert _graph_is_connected(csc_container(graph))
def test_spectral_embedding_deterministic():
# Test that Spectral Embedding is deterministic
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
embedding_1 = spectral_embedding(sims)
embedding_2 = spectral_embedding(sims)
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_unnormalized():
# Test that spectral_embedding is also processing unnormalized laplacian
# correctly
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 8
embedding_1 = spectral_embedding(
sims, norm_laplacian=False, n_components=n_components, drop_first=False
)
# Verify using manual computation with dense eigh
laplacian, dd = csgraph_laplacian(sims, normed=False, return_diag=True)
_, diffusion_map = eigh(laplacian)
embedding_2 = diffusion_map.T[:n_components]
embedding_2 = _deterministic_vector_sign_flip(embedding_2).T
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_first_eigen_vector():
# Test that the first eigenvector of spectral_embedding
# is constant and that the second is not (for a connected graph)
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 2
for seed in range(10):
embedding = spectral_embedding(
sims,
norm_laplacian=False,
n_components=n_components,
drop_first=False,
random_state=seed,
)
assert np.std(embedding[:, 0]) == pytest.approx(0)
assert np.std(embedding[:, 1]) > 1e-3
@pytest.mark.parametrize(
"eigen_solver",
[
"arpack",
"lobpcg",
pytest.param("amg", marks=skip_if_no_pyamg),
],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
"""Check that `SpectralEmbedding is preserving the dtype of the fitted
attribute and transformed data.
Ideally, this test should be covered by the common test
`check_transformer_preserve_dtypes`. However, this test only run
with transformers implementing `transform` while `SpectralEmbedding`
implements only `fit_transform`.
"""
X = S.astype(dtype)
se = SpectralEmbedding(
n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
)
X_trans = se.fit_transform(X)
assert X_trans.dtype == dtype
assert se.embedding_.dtype == dtype
assert se.affinity_matrix_.dtype == dtype
@pytest.mark.skipif(
pyamg_available,
reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
se_precomp = SpectralEmbedding(
n_components=2,
affinity="rbf",
eigen_solver="amg",
)
err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
with pytest.raises(ValueError, match=err_msg):
se_precomp.fit_transform(S)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
"ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container):
"""Test that `eigen_tol="auto"` is resolved correctly"""
if solver == "amg" and not pyamg_available:
pytest.skip("PyAMG is not available.")
X, _ = make_blobs(
n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
solver_func = eigsh if solver == "arpack" else lobpcg
default_value = 0 if solver == "arpack" else None
if solver == "amg":
S = csr_container(S)
mocked_solver = Mock(side_effect=solver_func)
monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)
spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto")
mocked_solver.assert_called()
_, kwargs = mocked_solver.call_args
assert kwargs["tol"] == default_value
File diff suppressed because it is too large Load Diff