feat: initial commit - Phase 1 & 2 core features

This commit is contained in:
hiderfong
2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,348 @@
import math
from itertools import product
import numpy as np
import pytest
from scipy.sparse import rand as sparse_rand
from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils._testing import (
assert_allclose,
assert_allclose_dense_sparse,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
# Eigen solvers and graph shortest-path methods exercised by the
# parametrized Isomap tests below ("FW" = Floyd-Warshall, "D" = Dijkstra).
eigen_solvers = ["auto", "dense", "arpack"]
path_methods = ["auto", "FW", "D"]
def create_sample_data(dtype, n_pts=25, add_noise=False):
    """Return ``n_pts`` points on a square, equidistant 2D grid.

    When ``add_noise`` is True, a third coordinate containing small
    Gaussian noise (fixed seed, hence deterministic) is appended so the
    data is no longer exactly planar.
    """
    side = int(math.sqrt(n_pts))
    # Row-major grid enumeration: (0,0), (0,1), ..., (side-1, side-1).
    coords = [(i, j) for i in range(side) for j in range(side)]
    X = np.array(coords).astype(dtype, copy=False)
    if add_noise:
        # Small deterministic perturbation along a third axis.
        noise_gen = np.random.RandomState(0)
        jitter = 0.1 * noise_gen.randn(n_pts, 1).astype(dtype, copy=False)
        X = np.concatenate((X, jitter), 1)
    return X
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_simple_grid(
    global_dtype, n_neighbors, radius, eigen_solver, path_method
):
    """Isomap on a flat grid with all points as mutual neighbors must
    preserve the neighbor distance graph in the embedding."""
    # Isomap should preserve distances when all neighbors are used
    n_pts = 25
    X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False)

    # distances from each point to all others
    if n_neighbors is not None:
        G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
    else:
        G = neighbors.radius_neighbors_graph(X, radius, mode="distance")

    clf = manifold.Isomap(
        n_neighbors=n_neighbors,
        radius=radius,
        n_components=2,
        eigen_solver=eigen_solver,
        path_method=path_method,
    )
    clf.fit(X)

    # distance graph rebuilt in the embedding space with the same rule
    if n_neighbors is not None:
        G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
    else:
        G_iso = neighbors.radius_neighbors_graph(
            clf.embedding_, radius, mode="distance"
        )
    # float32 accumulates more rounding error, hence the looser tolerance
    atol = 1e-5 if global_dtype == np.float32 else 0
    assert_allclose_dense_sparse(G, G_iso, atol=atol)
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_reconstruction_error(
    global_dtype, n_neighbors, radius, eigen_solver, path_method
):
    """Isomap.reconstruction_error must match the error computed manually
    from the centered input and output kernels."""
    if global_dtype is np.float32:
        # NOTE: fixed a missing space between the adjacent string literals,
        # which previously rendered the reason as "...float32 datafrom...".
        pytest.skip(
            "Skipping test due to numerical instabilities on float32 data "
            "from KernelCenterer used in the reconstruction_error method"
        )

    # Same setup as in test_isomap_simple_grid, with an added dimension
    n_pts = 25
    X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True)

    # compute input kernel
    if n_neighbors is not None:
        G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
    else:
        G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray()
    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G**2)

    clf = manifold.Isomap(
        n_neighbors=n_neighbors,
        radius=radius,
        n_components=2,
        eigen_solver=eigen_solver,
        path_method=path_method,
    )
    clf.fit(X)

    # compute output kernel
    if n_neighbors is not None:
        G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
    else:
        G_iso = neighbors.radius_neighbors_graph(
            clf.embedding_, radius, mode="distance"
        )
    G_iso = G_iso.toarray()
    K_iso = centerer.fit_transform(-0.5 * G_iso**2)

    # make sure error agrees
    reconstruction_error = np.linalg.norm(K - K_iso) / n_pts
    atol = 1e-5 if global_dtype == np.float32 else 0
    assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol)
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
def test_transform(global_dtype, n_neighbors, radius):
    """Re-embedding slightly perturbed training points must stay close to
    the original embedding (RMS error comparable to the noise scale)."""
    n_samples = 200
    n_components = 10
    noise_scale = 0.01

    # S-curve dataset, cast to the dtype under test.
    X, y = datasets.make_s_curve(n_samples, random_state=0)
    X = X.astype(global_dtype, copy=False)

    # Fit the Isomap embedding once on the clean data.
    model = manifold.Isomap(
        n_components=n_components, n_neighbors=n_neighbors, radius=radius
    )
    embedding = model.fit_transform(X)

    # Transform a noisy copy of the training points.
    rng = np.random.RandomState(0)
    perturbed = X + noise_scale * rng.randn(*X.shape)
    embedding_noisy = model.transform(perturbed)

    # The re-embedding RMS error should be on the order of the noise.
    rms_error = np.sqrt(np.mean((embedding - embedding_noisy) ** 2))
    assert rms_error < 2 * noise_scale
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
def test_pipeline(n_neighbors, radius, global_dtype):
    """Smoke-test Isomap as a transformer step inside a Pipeline.

    Only checks that fitting/scoring raise no error and that the training
    accuracy is high.
    """
    # TODO check that it actually does something useful
    data, labels = datasets.make_blobs(random_state=0)
    data = data.astype(global_dtype, copy=False)
    estimator = pipeline.Pipeline(
        steps=[
            ("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
            ("clf", neighbors.KNeighborsClassifier()),
        ]
    )
    estimator.fit(data, labels)
    assert estimator.score(data, labels) > 0.9
def test_pipeline_with_nearest_neighbors_transformer(global_dtype):
    """Chaining KNeighborsTransformer + Isomap(metric='precomputed') must
    match a standalone Isomap, on both fit_transform and transform."""
    # Test chaining NearestNeighborsTransformer and Isomap with
    # neighbors_algorithm='precomputed'
    algorithm = "auto"
    n_neighbors = 10

    X, _ = datasets.make_blobs(random_state=0)
    X2, _ = datasets.make_blobs(random_state=1)
    X = X.astype(global_dtype, copy=False)
    X2 = X2.astype(global_dtype, copy=False)

    # compare the chained version and the compact version
    est_chain = pipeline.make_pipeline(
        neighbors.KNeighborsTransformer(
            n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
        ),
        manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"),
    )
    est_compact = manifold.Isomap(
        n_neighbors=n_neighbors, neighbors_algorithm=algorithm
    )

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_allclose(Xt_chain, Xt_compact)

    # the two versions must also agree on previously unseen data
    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_allclose(Xt_chain, Xt_compact)
@pytest.mark.parametrize(
    "metric, p, is_euclidean",
    [
        ("euclidean", 2, True),
        ("manhattan", 1, False),
        ("minkowski", 1, False),
        ("minkowski", 2, True),
        (lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False),
    ],
)
def test_different_metric(global_dtype, metric, p, is_euclidean):
    """Isomap must honor the ``metric`` parameter: euclidean-equivalent
    metrics reproduce the default embedding, other metrics must differ."""
    # Isomap must handle various metric parameters correctly
    # and must default to euclidean.
    X, _ = datasets.make_blobs(random_state=0)
    X = X.astype(global_dtype, copy=False)

    # embedding with the default (euclidean) metric serves as reference
    reference = manifold.Isomap().fit_transform(X)
    embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)

    if is_euclidean:
        assert_allclose(embedding, reference)
    else:
        # a genuinely different metric must yield a different embedding
        with pytest.raises(AssertionError, match="Not equal to tolerance"):
            assert_allclose(embedding, reference)
def test_isomap_clone_bug():
    """Non-regression test for issue #6062: after set_params + refit, the
    fitted nearest-neighbors estimator must pick up the new n_neighbors."""
    model = manifold.Isomap()
    for k in (10, 15, 20):
        model.set_params(n_neighbors=k)
        model.fit(np.random.rand(50, 2))
        assert model.nbrs_.n_neighbors == k
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input(
    global_dtype, eigen_solver, path_method, global_random_seed, csr_container
):
    """Isomap fitted on sparse input must match the dense-input result."""
    # TODO: compare results on dense and sparse data as proposed in:
    # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
    X = csr_container(
        sparse_rand(
            100,
            3,
            density=0.1,
            format="csr",
            dtype=global_dtype,
            random_state=global_random_seed,
        )
    )

    iso_dense = manifold.Isomap(
        n_components=2,
        eigen_solver=eigen_solver,
        path_method=path_method,
        n_neighbors=8,
    )
    # identical hyper-parameters, fitted on the sparse representation
    iso_sparse = clone(iso_dense)

    X_trans_dense = iso_dense.fit_transform(X.toarray())
    X_trans_sparse = iso_sparse.fit_transform(X)

    assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4)
def test_isomap_fit_precomputed_radius_graph(global_dtype):
    # Isomap.fit_transform must yield similar result when using
    # a precomputed distance matrix.
    X, y = datasets.make_s_curve(200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    radius = 10

    # sparse radius-neighbors distance graph fed in as "precomputed"
    g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
    isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed")
    isomap.fit(g)
    precomputed_result = isomap.embedding_

    # same embedding computed directly from the raw coordinates
    isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
    result = isomap.fit_transform(X)
    # float32 accumulates more rounding error, hence the looser tolerance
    atol = 1e-5 if global_dtype == np.float32 else 0
    assert_allclose(precomputed_result, result, atol=atol)
def test_isomap_fitted_attributes_dtype(global_dtype):
    """Fitted attributes must share the dtype of the training data."""
    data = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
    model = manifold.Isomap(n_neighbors=2)
    model.fit(data)
    # both the geodesic distance matrix and the embedding follow X's dtype
    assert model.dist_matrix_.dtype == global_dtype
    assert model.embedding_.dtype == global_dtype
def test_isomap_dtype_equivalence():
    """Fitting on float32 and float64 copies of the same data must give
    numerically equivalent geodesic distance matrices."""
    data32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    model32 = manifold.Isomap(n_neighbors=2)
    model32.fit(data32)

    data64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    model64 = manifold.Isomap(n_neighbors=2)
    model64.fit(data64)

    assert_allclose(model32.dist_matrix_, model64.dist_matrix_)
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
    """fit_transform must raise a ValueError when both n_neighbors and
    radius are supplied — the two neighborhood rules are exclusive."""
    X, _ = datasets.load_digits(return_X_y=True)
    model = manifold.Isomap(n_neighbors=3, radius=5.5)
    expected_msg = "Both n_neighbors and radius are provided"
    with pytest.raises(ValueError, match=expected_msg):
        model.fit_transform(X)
def test_multiple_connected_components():
    """A warning must be raised when the neighbors graph is disconnected."""
    # two well-separated 1D clusters -> two graph components at k=2
    points = np.array([0, 1, 2, 5, 6, 7]).reshape(-1, 1)
    with pytest.warns(UserWarning, match="number of connected components"):
        manifold.Isomap(n_neighbors=2).fit(points)
def test_multiple_connected_components_metric_precomputed(global_dtype):
    """With metric='precomputed', a disconnected dense distance matrix only
    warns, but a disconnected sparse neighbors graph raises an error."""
    # Test that an error is raised when the graph has multiple components
    # and when X is a precomputed neighbors graph.
    X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False)

    # works with a precomputed distance matrix (dense)
    X_distances = pairwise_distances(X)
    with pytest.warns(UserWarning, match="number of connected components"):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)

    # does not work with a precomputed neighbors graph (sparse)
    X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
    with pytest.raises(RuntimeError, match="number of connected components"):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
def test_get_feature_names_out():
    """Check get_feature_names_out for Isomap."""
    n_components = 2
    X, _ = make_blobs(random_state=0, n_features=4)
    est = manifold.Isomap(n_components=n_components)
    est.fit_transform(X)
    # output names follow the "isomap<i>" convention, one per component
    expected = [f"isomap{i}" for i in range(n_components)]
    assert_array_equal(expected, est.get_feature_names_out())
@@ -0,0 +1,171 @@
from itertools import product
import numpy as np
import pytest
from scipy import linalg
from sklearn import manifold, neighbors
from sklearn.datasets import make_blobs
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
from sklearn.utils._testing import (
assert_allclose,
assert_array_equal,
ignore_warnings,
)
eigen_solvers = ["dense", "arpack"]
# ----------------------------------------------------------------------
# Test utility routines
def test_barycenter_kneighbors_graph(global_dtype):
    """Check barycenter weights: dtype preservation, rows summing to one,
    and a reasonable reconstruction of each point from its neighbors."""
    X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)

    # with a single neighbor, each point gets weight 1 on its nearest one
    graph = barycenter_kneighbors_graph(X, 1)
    expected_graph = np.array(
        [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
    )
    assert graph.dtype == global_dtype
    assert_allclose(graph.toarray(), expected_graph)

    graph = barycenter_kneighbors_graph(X, 2)
    # check that the barycenter weights in each row sum to one (axis=1)
    assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3))
    pred = np.dot(graph.toarray(), X)
    # each point should be roughly reconstructed by its weighted neighbors
    assert linalg.norm(pred - X) / X.shape[0] < 1
# ----------------------------------------------------------------------
# Test LLE by computing the reconstruction error on some manifolds.
def test_lle_simple_grid(global_dtype):
    """LLE on a flat 2D grid: barycenter weights must reconstruct both the
    input and the learned embedding with small error, for every solver."""
    # note: ARPACK is numerically unstable, so this test will fail for
    # some random seeds. We choose 42 because the tests pass.
    # for arm64 platforms 2 makes the test fail.
    # TODO: rewrite this test to make less sensitive to the random seed,
    # irrespective of the platform.
    rng = np.random.RandomState(42)

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(5), repeat=2)))
    # tiny jitter breaks exact degeneracies in the neighbor graph
    X = X + 1e-10 * rng.uniform(size=X.shape)
    X = X.astype(global_dtype, copy=False)

    n_components = 2
    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=5, n_components=n_components, random_state=rng
    )
    tol = 0.1

    # reconstruction of the raw input from barycenter weights
    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro")
    assert reconstruction_error < tol

    for solver in eigen_solvers:
        clf.set_params(eigen_solver=solver)
        clf.fit(X)
        assert clf.embedding_.shape[1] == n_components
        # same weights must also reconstruct the embedding
        reconstruction_error = (
            linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
        )
        assert reconstruction_error < tol
        assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)

    # re-embed a noisy version of X using the transform method
    noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
    X_reembedded = clf.transform(X + noise)
    assert linalg.norm(X_reembedded - clf.embedding_) < tol
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
@pytest.mark.parametrize("solver", eigen_solvers)
def test_lle_manifold(global_dtype, method, solver):
    """Run every LLE variant on a curved 2D manifold embedded in 3D and
    check the reconstruction errors stay below a method-dependent bound."""
    rng = np.random.RandomState(0)
    # similar test on a slightly more complex manifold
    X = np.array(list(product(np.arange(18), repeat=2)))
    # third coordinate bends the grid into a parabolic sheet
    X = np.c_[X, X[:, 0] ** 2 / 18]
    X = X + 1e-10 * rng.uniform(size=X.shape)
    X = X.astype(global_dtype, copy=False)

    n_components = 2
    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=6, n_components=n_components, method=method, random_state=0
    )
    # "standard" converges tighter than the other variants
    tol = 1.5 if method == "standard" else 3

    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    reconstruction_error = linalg.norm(np.dot(N, X) - X)
    assert reconstruction_error < tol

    clf.set_params(eigen_solver=solver)
    clf.fit(X)
    assert clf.embedding_.shape[1] == n_components
    reconstruction_error = (
        linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
    )
    # identify the failing parametrization in the assertion message
    details = "solver: %s, method: %s" % (solver, method)
    assert reconstruction_error < tol, details
    assert (
        np.abs(clf.reconstruction_error_ - reconstruction_error)
        < tol * reconstruction_error
    ), details
def test_pipeline():
    """Smoke-test LocallyLinearEmbedding as a Pipeline transformer.

    Only checks that fitting/scoring raise no error and that the training
    accuracy is high.
    """
    # TODO check that it actually does something useful
    from sklearn import datasets, pipeline

    data, labels = datasets.make_blobs(random_state=0)
    estimator = pipeline.Pipeline(
        steps=[
            ("filter", manifold.LocallyLinearEmbedding(random_state=0)),
            ("clf", neighbors.KNeighborsClassifier()),
        ]
    )
    estimator.fit(data, labels)
    assert estimator.score(data, labels) > 0.9
# Test the error raised when the weight matrix is singular
def test_singular_matrix():
    """locally_linear_embedding must raise a ValueError when ARPACK cannot
    determine the null-space of the (singular) weight matrix."""
    M = np.ones((200, 3))
    # NOTE: the previous `ignore_warnings(...)` wrapper was applied to the
    # *return value* of the call — i.e. after execution — so it suppressed
    # nothing and never even ran (the expected ValueError is raised inside
    # the call). The call is now made directly inside the raises context.
    with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"):
        manifold.locally_linear_embedding(
            M,
            n_neighbors=2,
            n_components=1,
            method="standard",
            eigen_solver="arpack",
        )
# regression test for #6033
def test_integer_input():
    """Every LLE method must accept integer-typed input without error."""
    rng = np.random.RandomState(0)
    data = rng.randint(0, 100, size=(20, 3))
    for variant in ("standard", "hessian", "modified", "ltsa"):
        model = manifold.LocallyLinearEmbedding(method=variant, n_neighbors=10)
        model.fit(data)  # this previously raised a TypeError
def test_get_feature_names_out():
    """Check get_feature_names_out for LocallyLinearEmbedding."""
    n_components = 2
    X, _ = make_blobs(random_state=0, n_features=4)
    lle = manifold.LocallyLinearEmbedding(n_components=n_components)
    lle.fit(X)
    # names follow the "locallylinearembedding<i>" convention
    expected = [f"locallylinearembedding{i}" for i in range(n_components)]
    assert_array_equal(expected, lle.get_feature_names_out())
@@ -0,0 +1,87 @@
from unittest.mock import Mock
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_almost_equal
from sklearn.manifold import _mds as mds
from sklearn.metrics import euclidean_distances
def test_smacof():
    # test metric smacof using the data of "Modern Multidimensional Scaling",
    # Borg & Groenen, p 154
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]])
    # a single SMACOF iteration from the given init must reproduce the
    # textbook configuration
    X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
    X_true = np.array(
        [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
    )
    assert_array_almost_equal(X, X_true, decimal=3)
def test_smacof_error():
    """smacof must reject malformed similarity or init inputs."""
    # A non-symmetric similarity matrix is invalid.
    sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    with pytest.raises(ValueError):
        mds.smacof(sim)

    # A non-square similarity matrix is invalid.
    sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])
    with pytest.raises(ValueError):
        mds.smacof(sim)

    # An init array whose shape does not match the data is invalid.
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
    with pytest.raises(ValueError):
        mds.smacof(sim, init=Z, n_init=1)
def test_MDS():
    """Smoke-test fitting non-metric MDS on a precomputed dissimilarity."""
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    estimator = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
    estimator.fit(sim)
@pytest.mark.parametrize("k", [0.5, 1.5, 2])
def test_normed_stress(k):
    """Test that non-metric MDS normalized stress is scale-invariant."""
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])

    # scaling the dissimilarities by k must leave both the stress and the
    # embedding unchanged (non-metric MDS only uses rank order)
    X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0)
    X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0)

    assert_allclose(stress1, stress2, rtol=1e-5)
    assert_allclose(X1, X2, rtol=1e-5)
def test_normalize_metric_warning():
    """
    Test that a ValueError is raised when requesting normalized stress
    together with metric MDS (the code below asserts a raise, not a
    warning, despite this function's historical name).
    """
    msg = "Normalized stress is not supported"
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    with pytest.raises(ValueError, match=msg):
        mds.smacof(sim, metric=True, normalized_stress=True)
@pytest.mark.parametrize("metric", [True, False])
def test_normalized_stress_auto(metric, monkeypatch):
    """normalized_stress='auto' must resolve to the opposite of ``metric``
    (True for non-metric MDS, False for metric MDS), in both the MDS class
    and the smacof function."""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 3)
    dist = euclidean_distances(X)

    # spy on _smacof_single to observe the resolved normalized_stress flag
    mock = Mock(side_effect=mds._smacof_single)
    monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock)

    est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng)
    est.fit_transform(X)
    assert mock.call_args[1]["normalized_stress"] != metric

    mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng)
    assert mock.call_args[1]["normalized_stress"] != metric
@@ -0,0 +1,541 @@
from unittest.mock import Mock
import numpy as np
import pytest
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh, lobpcg
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding
from sklearn.manifold._spectral_embedding import (
_graph_connected_component,
_graph_is_connected,
)
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
parse_version,
sp_version,
)
from sklearn.utils.fixes import laplacian as csgraph_laplacian
# pyamg is an optional dependency: detect its availability once so the
# "amg" eigen-solver tests can be skipped when it is missing.
try:
    from pyamg import smoothed_aggregation_solver  # noqa

    pyamg_available = True
except ImportError:
    pyamg_available = False
# reusable skip marker for all amg-solver parametrizations below
skip_if_no_pyamg = pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# non centered, sparse centers to check the handling of non-trivial data;
# the resulting blobs (S, true_labels) are shared by several tests below
centers = np.array(
    [
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ]
)
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
"""Check array A and B are equal with possible sign flipping on
each columns"""
tol_squared = tol**2
for A_col, B_col in zip(A.T, B.T):
assert (
np.max((A_col - B_col) ** 2) <= tol_squared
or np.max((A_col + B_col) ** 2) <= tol_squared
)
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_sparse_graph_connected_component(coo_container):
    """_graph_connected_component must recover each disjoint group of a
    sparse affinity matrix, regardless of the seed node chosen."""
    rng = np.random.RandomState(42)
    n_samples = 300
    # group boundaries within a random permutation of the samples
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    connections = []

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        for i in range(len(group) - 1):
            connections.append((group[i], group[i + 1]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        n_random_connections = 1000
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(0.1, 42, size=len(connections))
    affinity = coo_container((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        component_1 = _graph_connected_component(affinity, p[start])
        component_size = stop - start
        assert component_1.sum() == component_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        component_2 = _graph_connected_component(affinity, p[stop - 1])
        assert component_2.sum() == component_size
        assert_array_equal(component_1, component_2)
# TODO: investigate why this test is seed-sensitive on 32-bit Python
# runtimes. Is this revealing a numerical stability problem ? Or is it
# expected from the test numerical design ? In the latter case the test
# should be made less seed-sensitive instead.
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
    """On a graph made of two weakly-connected blocks, the first spectral
    component must separate the two blocks perfectly."""
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])

    # first component
    affinity[0:n_sample, 0:n_sample] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )
    # second component
    affinity[n_sample::, n_sample::] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    # zero the diagonal, then symmetrize
    affinity.flat[:: 2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )

    embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
    # perfect agreement (up to label permutation) -> NMI of 1.0
    assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(
    sparse_container, eigen_solver, dtype, seed=36
):
    """Feeding a precomputed RBF kernel must match the built-in "rbf"
    affinity path, for dense and sparse inputs and all solvers."""
    # Test spectral embedding with precomputed kernel
    gamma = 1.0
    X = S if sparse_container is None else sparse_container(S)
    se_precomp = SpectralEmbedding(
        n_components=2,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma))
    embed_rbf = se_rbf.fit_transform(X.astype(dtype))
    assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    # eigenvectors are defined up to sign, so compare with flipping allowed
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
def test_precomputed_nearest_neighbors_filtering():
    """A precomputed graph containing extra neighbors must be filtered down
    so the embedding matches the one built from the minimal graph."""
    n_neighbors = 2
    embeddings = []
    for extra in (0, 10):
        nn = NearestNeighbors(n_neighbors=n_neighbors + extra).fit(S)
        graph = nn.kneighbors_graph(S, mode="connectivity")
        est = SpectralEmbedding(
            random_state=0,
            n_components=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        embeddings.append(est.fit(graph).embedding_)
    # both graphs must yield the exact same embedding after filtering
    assert_array_equal(embeddings[0], embeddings[1])
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
def test_spectral_embedding_callable_affinity(sparse_container, seed=36):
    """A callable affinity computing an RBF kernel must match the built-in
    "rbf" affinity, for dense and sparse inputs."""
    # Test spectral embedding with callable affinity
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)
    X = S if sparse_container is None else sparse_container(S)
    se_callable = SpectralEmbedding(
        n_components=2,
        affinity=(lambda x: rbf_kernel(x, gamma=gamma)),
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)
    # both paths must build the same affinity matrix...
    assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    # ...and the embeddings agree up to per-column sign flips
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36):
    """The "amg" eigen-solver must agree with "arpack" (up to column sign
    flips), including on a tiny precomputed affinity, and must handle or
    reject int64 sparse indices depending on the installed SciPy."""
    se_amg = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="amg",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    se_arpack = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="arpack",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    embed_amg = se_amg.fit_transform(S.astype(dtype))
    embed_arpack = se_arpack.fit_transform(S.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes
    row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32)
    col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32)
    val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64)

    # symmetric 6x6 affinity built from the edge list above
    affinity = coo_container(
        (np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))),
        shape=(6, 6),
    )

    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity.astype(dtype))
    embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # Check that passing a sparse matrix with `np.int64` indices dtype raises an error
    # or is successful based on the version of SciPy which is installed.
    # Use a CSR matrix to avoid any conversion during the validation
    affinity = affinity.tocsr()
    affinity.indptr = affinity.indptr.astype(np.int64)
    affinity.indices = affinity.indices.astype(np.int64)

    # PR: https://github.com/scipy/scipy/pull/18913
    # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
    scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3")
    if scipy_graph_traversal_supports_int64_index:
        se_amg.fit_transform(affinity)
    else:
        err_msg = "Only sparse matrices with 32-bit integer indices are accepted"
        with pytest.raises(ValueError, match=err_msg):
            se_amg.fit_transform(affinity)
# TODO: Remove filterwarnings when pyamg does replaces sp.rand call with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for amg solver failure (issue #13393 on github)
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    X = X.astype(dtype)
    # symmetrize the random matrix while zeroing its diagonal
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(
        sym_matrix, n_components=10, eigen_solver="amg", random_state=0
    )

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
        )
        _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)
@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(
        n_components=n_clusters, affinity="rbf", random_state=random_state
    )
    se_knn = SpectralEmbedding(
        n_components=n_clusters,
        affinity="nearest_neighbors",
        n_neighbors=5,
        random_state=random_state,
    )
    for se in [se_rbf, se_knn]:
        # cluster in the embedded space and compare against the true labels
        km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
        km.fit(se.fit_transform(S))
        # near-perfect label recovery (NMI close to 1, 2 decimals)
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2
        )
def test_connectivity(seed=36):
    """_graph_is_connected must agree across dense, CSR and CSC inputs for
    both a disconnected and a connected adjacency matrix."""
    # node 0 is isolated from the chain 1-2-3-4
    disconnected = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert not _graph_is_connected(disconnected)
    for container in CSR_CONTAINERS:
        assert not _graph_is_connected(container(disconnected))
    for container in CSC_CONTAINERS:
        assert not _graph_is_connected(container(disconnected))

    # adding the 0-1 edge makes the whole chain connected
    connected = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert _graph_is_connected(connected)
    for container in CSR_CONTAINERS:
        assert _graph_is_connected(container(connected))
    for container in CSC_CONTAINERS:
        assert _graph_is_connected(container(connected))
def test_spectral_embedding_deterministic():
    # Two calls on the very same affinity matrix must produce identical
    # embeddings.
    rng = np.random.RandomState(36)
    affinity = rbf_kernel(rng.randn(10, 30))
    first = spectral_embedding(affinity)
    second = spectral_embedding(affinity)
    assert_array_almost_equal(first, second)
def test_spectral_embedding_unnormalized():
    # With norm_laplacian=False, spectral_embedding should agree with a
    # manual dense eigendecomposition of the unnormalized graph Laplacian.
    rng = np.random.RandomState(36)
    affinity = rbf_kernel(rng.randn(10, 30))
    n_components = 8
    actual = spectral_embedding(
        affinity, norm_laplacian=False, n_components=n_components, drop_first=False
    )

    # Reference computation: dense eigh on the unnormalized Laplacian,
    # with the same deterministic sign convention applied.
    laplacian, _ = csgraph_laplacian(affinity, normed=False, return_diag=True)
    _, eigenvectors = eigh(laplacian)
    expected = _deterministic_vector_sign_flip(eigenvectors.T[:n_components]).T
    assert_array_almost_equal(actual, expected)
def test_spectral_embedding_first_eigen_vector():
    # For a connected graph with the unnormalized Laplacian, the first
    # eigenvector is constant (zero std) and the second one is not,
    # regardless of the solver's random initialization.
    rng = np.random.RandomState(36)
    affinity = rbf_kernel(rng.randn(10, 30))
    for seed in range(10):
        embedding = spectral_embedding(
            affinity,
            norm_laplacian=False,
            n_components=2,
            drop_first=False,
            random_state=seed,
        )
        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding` preserves the dtype of the fitted
    attributes and of the transformed data.

    Ideally, this test should be covered by the common test
    `check_transformer_preserve_dtypes`. However, that test only runs
    with transformers implementing `transform`, while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    X = S.astype(dtype)
    model = SpectralEmbedding(
        n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
    )
    X_embedded = model.fit_transform(X)

    assert X_embedded.dtype == dtype
    assert model.embedding_.dtype == dtype
    assert model.affinity_matrix_.dtype == dtype
@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    # Requesting the "amg" solver while pyamg is missing must raise a
    # ValueError with an informative message.
    model = SpectralEmbedding(n_components=2, affinity="rbf", eigen_solver="amg")
    err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
    with pytest.raises(ValueError, match=err_msg):
        model.fit_transform(S)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container):
    """Test that `eigen_tol="auto"` is resolved correctly"""
    if solver == "amg" and not pyamg_available:
        pytest.skip("PyAMG is not available.")

    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    distances = pairwise_distances(X)
    # Turn the distance matrix into a similarity matrix.
    similarities = np.max(distances) - distances
    if solver == "amg":
        similarities = csr_container(similarities)

    # "auto" should resolve to tol=0 for arpack's eigsh and tol=None for lobpcg.
    solver_func = eigsh if solver == "arpack" else lobpcg
    expected_tol = 0 if solver == "arpack" else None

    # Spy on the underlying eigensolver to inspect the tol it receives.
    mocked_solver = Mock(side_effect=solver_func)
    monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)
    spectral_embedding(
        similarities, random_state=42, eigen_solver=solver, eigen_tol="auto"
    )
    mocked_solver.assert_called()
    _, kwargs = mocked_solver.call_args
    assert kwargs["tol"] == expected_tol
# NOTE(review): removed stray web-diff artifact line ("File diff suppressed
# because it is too large Load Diff") — it was not valid Python.