feat: initial commit - Phase 1 & 2 core features

2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,51 @@
+"""Module to give helpful messages to the user that did not
+compile scikit-learn properly.
+"""
+
+import os
+
+# Hint appended to the build error when the package directory is the
+# relative path "sklearn/__check_build", i.e. a local source tree.
+INPLACE_MSG = """
+It appears that you are importing a local scikit-learn source tree. For
+this, you need to have an inplace install. Maybe you are in the source
+directory and you need to try from another location."""
+
+# Default hint appended to the build error in every other case.
+STANDARD_MSG = """
+If you have used an installer, please check that it is suited for your
+Python version, your operating system and your platform."""
+
+
+def raise_build_error(e):
+    # Raise a comprehensible error and list the contents of the
+    # directory to help debugging on the mailing list.
+    local_dir = os.path.split(__file__)[0]
+    msg = STANDARD_MSG
+    if local_dir == "sklearn/__check_build":
+        # Picking up the local install: this will work only if the
+        # install is an 'inplace build'
+        msg = INPLACE_MSG
+    dir_content = list()
+    for i, filename in enumerate(os.listdir(local_dir)):
+        if (i + 1) % 3:
+            dir_content.append(filename.ljust(26))
+        else:
+            dir_content.append(filename + "\n")
+    raise ImportError(
+        """%s
+___________________________________________________________________________
+Contents of %s:
+%s
+___________________________________________________________________________
+It seems that scikit-learn has not been built correctly.
+
+If you have installed scikit-learn from source, please do not forget
+to build the package before using it: run `python setup.py install` or
+`make` in the source directory.
+%s"""
+        % (e, local_dir, "".join(dir_content).strip(), msg)
+    )
+
+
+# Importing the compiled extension is the build check itself: when it cannot
+# be imported, raise_build_error turns the bare ImportError into a detailed
+# diagnostic message.
+try:
+    from ._check_build import check_build  # noqa
+except ImportError as e:
+    raise_build_error(e)
@@ -0,0 +1,2 @@
+def check_build():
+    """Do nothing and return None."""
+    return None
@@ -0,0 +1,7 @@
+# Build the `_check_build` extension module from its Cython source and
+# install it under `sklearn/__check_build`.
+py.extension_module(
+  '_check_build',
+  '_check_build.pyx',
+  cython_args: cython_args,
+  install: true,
+  subdir: 'sklearn/__check_build',
+)
@@ -0,0 +1,172 @@
+"""
+The :mod:`sklearn` module includes functions to configure global settings and
+get information about the working environment.
+"""
+
+# Machine learning module for Python
+# ==================================
+#
+# sklearn is a Python module integrating classical machine
+# learning algorithms in the tightly-knit world of scientific Python
+# packages (numpy, scipy, matplotlib).
+#
+# It aims to provide simple and efficient solutions to learning problems
+# that are accessible to everybody and reusable in various contexts:
+# machine-learning as a versatile tool for science and engineering.
+#
+# See https://scikit-learn.org for complete documentation.
+
+import logging
+import os
+import random
+import sys
+
+from ._config import config_context, get_config, set_config
+
+logger = logging.getLogger(__name__)
+
+
+# PEP0440 compatible formatted version, see:
+# https://www.python.org/dev/peps/pep-0440/
+#
+# Generic release markers:
+#   X.Y.0   # For first release after an increment in Y
+#   X.Y.Z   # For bugfix releases
+#
+# Admissible pre-release markers:
+#   X.Y.ZaN   # Alpha release
+#   X.Y.ZbN   # Beta release
+#   X.Y.ZrcN  # Release Candidate
+#   X.Y.Z     # Final release
+#
+# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
+# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
+#
+__version__ = "1.5.0"
+
+
+# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
+# simultaneously. This can happen for instance when calling BLAS inside a
+# prange. Setting the following environment variable allows multiple OpenMP
+# libraries to be loaded. It should not degrade performance since we manually
+# take care of potential over-subscription performance issues, in sections of
+# the code where nested OpenMP loops can happen, by dynamically reconfiguring
+# the inner OpenMP runtime to temporarily disable it while under the scope of
+# the outer OpenMP parallel section.
+# `setdefault` is used so that a value already set by the user is kept.
+os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")
+
+# Workaround issue discovered in intel-openmp 2019.5:
+# https://github.com/ContinuumIO/anaconda-issues/issues/11294
+os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
+
+try:
+    # This variable is injected in the __builtins__ by the build
+    # process. It is used to enable importing subpackages of sklearn when
+    # the binaries are not built
+    # mypy error: Cannot determine type of '__SKLEARN_SETUP__'
+    __SKLEARN_SETUP__  # type: ignore
+except NameError:
+    # The name was not injected: this is a regular import, not a build.
+    __SKLEARN_SETUP__ = False
+
+if __SKLEARN_SETUP__:
+    sys.stderr.write("Partial import of sklearn during the build process.\n")
+    # We are not importing the rest of scikit-learn during the build
+    # process, as it may not be compiled yet
+else:
+    # Import numpy, scipy to make sure that the BLAS libs are loaded before
+    # creating the ThreadpoolController. They would be imported just after
+    # when importing utils anyway. This makes it explicit and robust to changes
+    # in utils.
+    # (OpenMP is loaded by importing show_versions right after this block)
+    import numpy  # noqa
+    import scipy.linalg  # noqa
+    from threadpoolctl import ThreadpoolController
+
+    # `_distributor_init` allows distributors to run custom init code.
+    # For instance, for the Windows wheel, this is used to pre-load the
+    # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
+    # sub-folder.
+    # It is necessary to do this prior to importing show_versions as the
+    # latter is linked to the OpenMP runtime to make it possible to introspect
+    # it and importing it first would fail if the OpenMP dll cannot be found.
+    from . import (
+        __check_build,  # noqa: F401
+        _distributor_init,  # noqa: F401
+    )
+    from .base import clone
+    from .utils._show_versions import show_versions
+
+    # Public names of the package: submodules first, then the helper
+    # functions imported above.
+    __all__ = [
+        "calibration",
+        "cluster",
+        "covariance",
+        "cross_decomposition",
+        "datasets",
+        "decomposition",
+        "dummy",
+        "ensemble",
+        "exceptions",
+        "experimental",
+        "externals",
+        "feature_extraction",
+        "feature_selection",
+        "gaussian_process",
+        "inspection",
+        "isotonic",
+        "kernel_approximation",
+        "kernel_ridge",
+        "linear_model",
+        "manifold",
+        "metrics",
+        "mixture",
+        "model_selection",
+        "multiclass",
+        "multioutput",
+        "naive_bayes",
+        "neighbors",
+        "neural_network",
+        "pipeline",
+        "preprocessing",
+        "random_projection",
+        "semi_supervised",
+        "svm",
+        "tree",
+        "discriminant_analysis",
+        "impute",
+        "compose",
+        # Non-modules:
+        "clone",
+        "get_config",
+        "set_config",
+        "config_context",
+        "show_versions",
+    ]
+
+    # Flag set to True only when the `sklearn._built_with_meson` marker
+    # module can be imported.
+    _BUILT_WITH_MESON = False
+    try:
+        import sklearn._built_with_meson  # noqa: F401
+
+        _BUILT_WITH_MESON = True
+    except ModuleNotFoundError:
+        pass
+
+    # Set a global controller that can be used to locally limit the number of
+    # threads without looping through all shared libraries every time.
+    # This instantiation should not happen earlier because it needs all BLAS and
+    # OpenMP libs to be loaded first.
+    _threadpool_controller = ThreadpoolController()
+
+
+def setup_module(module):
+    """Fixture for the tests to assure globally controllable seeding of RNGs"""
+
+    import numpy as np
+
+    # Check if a random seed exists in the environment, if not create one.
+    _random_seed = os.environ.get("SKLEARN_SEED", None)
+    if _random_seed is None:
+        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
+    _random_seed = int(_random_seed)
+    print("I: Seeding RNGs with %r" % _random_seed)
+    np.random.seed(_random_seed)
+    random.seed(_random_seed)
@@ -0,0 +1,116 @@
+"""
+Utilities useful during the build.
+"""
+
+# author: Andy Mueller, Gael Varoquaux
+# license: BSD
+
+
+import contextlib
+import os
+
+import sklearn
+
+from .._min_dependencies import CYTHON_MIN_VERSION
+from ..externals._packaging.version import parse
+from .openmp_helpers import check_openmp_support
+from .pre_build_helpers import basic_check_build
+
+DEFAULT_ROOT = "sklearn"
+
+
+def _check_cython_version():
+    message = (
+        "Please install Cython with a version >= {0} in order "
+        "to build a scikit-learn from source."
+    ).format(CYTHON_MIN_VERSION)
+    try:
+        import Cython
+    except ModuleNotFoundError as e:
+        # Re-raise with more informative error message instead:
+        raise ModuleNotFoundError(message) from e
+
+    if parse(Cython.__version__) < parse(CYTHON_MIN_VERSION):
+        message += " The current version of Cython is {} installed in {}.".format(
+            Cython.__version__, Cython.__path__
+        )
+        raise ValueError(message)
+
+
+def cythonize_extensions(extension):
+    """Check that a recent Cython is available and cythonize extensions"""
+    _check_cython_version()
+    from Cython.Build import cythonize
+
+    # Fast fail before cythonization if compiler fails compiling basic test
+    # code even without OpenMP
+    basic_check_build()
+
+    # check simple compilation with OpenMP. If it fails scikit-learn will be
+    # built without OpenMP and the test test_openmp_supported in the test suite
+    # will fail.
+    # `check_openmp_support` compiles a small test program to see if the
+    # compilers are properly configured to build with OpenMP. This is expensive
+    # and we only want to call this function once.
+    # The result of this check is cached as a private attribute on the sklearn
+    # module (only at build-time) to be used in the build_ext subclass defined
+    # in the top-level setup.py file to actually build the compiled extensions
+    # with OpenMP flags if needed.
+    sklearn._OPENMP_SUPPORTED = check_openmp_support()
+
+    n_jobs = 1
+    with contextlib.suppress(ImportError):
+        import joblib
+
+        n_jobs = joblib.cpu_count()
+
+    # Additional checks for Cython
+    cython_enable_debug_directives = (
+        os.environ.get("SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES", "0") != "0"
+    )
+
+    compiler_directives = {
+        "language_level": 3,
+        "boundscheck": cython_enable_debug_directives,
+        "wraparound": False,
+        "initializedcheck": False,
+        "nonecheck": False,
+        "cdivision": True,
+        "profile": False,
+    }
+
+    return cythonize(
+        extension,
+        nthreads=n_jobs,
+        compiler_directives=compiler_directives,
+        annotate=False,
+    )
+
+
+def gen_from_templates(templates):
+    """Generate cython files from a list of templates"""
+    # Lazy import because cython is not a runtime dependency.
+    from Cython import Tempita
+
+    for template in templates:
+        outfile = template.replace(".tp", "")
+
+        # if the template is not updated, no need to output the cython file
+        if not (
+            os.path.exists(outfile)
+            and os.stat(template).st_mtime < os.stat(outfile).st_mtime
+        ):
+            with open(template, "r") as f:
+                tmpl = f.read()
+
+            tmpl_ = Tempita.sub(tmpl)
+
+            warn_msg = (
+                "# WARNING: Do not edit this file directly.\n"
+                f"# It is automatically generated from {template!r}.\n"
+                "# Changes must be made there.\n\n"
+            )
+
+            with open(outfile, "w") as f:
+                f.write(warn_msg)
+                f.write(tmpl_)
@@ -0,0 +1,127 @@
+"""Helpers for OpenMP support during the build."""
+
+# This code is adapted for a large part from the astropy openmp helpers, which
+# can be found at: https://github.com/astropy/extension-helpers/blob/master/extension_helpers/_openmp_helpers.py  # noqa
+
+
+import os
+import sys
+import textwrap
+import warnings
+
+from .pre_build_helpers import compile_test_program
+
+
+def get_openmp_flag():
+    if sys.platform == "win32":
+        return ["/openmp"]
+    elif sys.platform == "darwin" and "openmp" in os.getenv("CPPFLAGS", ""):
+        # -fopenmp can't be passed as compile flag when using Apple-clang.
+        # OpenMP support has to be enabled during preprocessing.
+        #
+        # For example, our macOS wheel build jobs use the following environment
+        # variables to build with Apple-clang and the brew installed "libomp":
+        #
+        # export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
+        # export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
+        # export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
+        # export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib
+        #                          -L/usr/local/opt/libomp/lib -lomp"
+        return []
+    # Default flag for GCC and clang:
+    return ["-fopenmp"]
+
+
+def check_openmp_support():
+    """Check whether OpenMP test code can be compiled and run"""
+    if "PYODIDE" in os.environ:
+        # Pyodide doesn't support OpenMP
+        return False
+
+    code = textwrap.dedent(
+        """\
+        #include <omp.h>
+        #include <stdio.h>
+        int main(void) {
+        #pragma omp parallel
+        printf("nthreads=%d\\n", omp_get_num_threads());
+        return 0;
+        }
+        """
+    )
+
+    extra_preargs = os.getenv("LDFLAGS", None)
+    if extra_preargs is not None:
+        extra_preargs = extra_preargs.strip().split(" ")
+        # FIXME: temporary fix to link against system libraries on linux
+        # "-Wl,--sysroot=/" should be removed
+        extra_preargs = [
+            flag
+            for flag in extra_preargs
+            if flag.startswith(("-L", "-Wl,-rpath", "-l", "-Wl,--sysroot=/"))
+        ]
+
+    extra_postargs = get_openmp_flag()
+
+    openmp_exception = None
+    try:
+        output = compile_test_program(
+            code, extra_preargs=extra_preargs, extra_postargs=extra_postargs
+        )
+
+        if output and "nthreads=" in output[0]:
+            nthreads = int(output[0].strip().split("=")[1])
+            openmp_supported = len(output) == nthreads
+        elif "PYTHON_CROSSENV" in os.environ:
+            # Since we can't run the test program when cross-compiling
+            # assume that openmp is supported if the program can be
+            # compiled.
+            openmp_supported = True
+        else:
+            openmp_supported = False
+
+    except Exception as exception:
+        # We could be more specific and only catch: CompileError, LinkError,
+        # and subprocess.CalledProcessError.
+        # setuptools introduced CompileError and LinkError, but that requires
+        # version 61.1. Even the latest version of Ubuntu (22.04LTS) only
+        # ships with 59.6. So for now we catch all exceptions and reraise a
+        # generic exception with the original error message instead:
+        openmp_supported = False
+        openmp_exception = exception
+
+    if not openmp_supported:
+        if os.getenv("SKLEARN_FAIL_NO_OPENMP"):
+            raise Exception(
+                "Failed to build scikit-learn with OpenMP support"
+            ) from openmp_exception
+        else:
+            message = textwrap.dedent(
+                """
+
+                                ***********
+                                * WARNING *
+                                ***********
+
+                It seems that scikit-learn cannot be built with OpenMP.
+
+                - Make sure you have followed the installation instructions:
+
+                    https://scikit-learn.org/dev/developers/advanced_installation.html
+
+                - If your compiler supports OpenMP but you still see this
+                  message, please submit a bug report at:
+
+                    https://github.com/scikit-learn/scikit-learn/issues
+
+                - The build will continue with OpenMP-based parallelism
+                  disabled. Note however that some estimators will run in
+                  sequential mode instead of leveraging thread-based
+                  parallelism.
+
+                                    ***
+                """
+            )
+            warnings.warn(message)
+
+    return openmp_supported
@@ -0,0 +1,75 @@
+"""Helpers to check build environment before actual build of scikit-learn"""
+
+import glob
+import os
+import subprocess
+import sys
+import tempfile
+import textwrap
+
+from setuptools.command.build_ext import customize_compiler, new_compiler
+
+
+def compile_test_program(code, extra_preargs=None, extra_postargs=None):
+    """Check that some C code can be compiled and run"""
+    ccompiler = new_compiler()
+    customize_compiler(ccompiler)
+
+    start_dir = os.path.abspath(".")
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        try:
+            os.chdir(tmp_dir)
+
+            # Write test program
+            with open("test_program.c", "w") as f:
+                f.write(code)
+
+            os.mkdir("objects")
+
+            # Compile, test program
+            ccompiler.compile(
+                ["test_program.c"], output_dir="objects", extra_postargs=extra_postargs
+            )
+
+            # Link test program
+            objects = glob.glob(os.path.join("objects", "*" + ccompiler.obj_extension))
+            ccompiler.link_executable(
+                objects,
+                "test_program",
+                extra_preargs=extra_preargs,
+                extra_postargs=extra_postargs,
+            )
+
+            if "PYTHON_CROSSENV" not in os.environ:
+                # Run test program if not cross compiling
+                # will raise a CalledProcessError if return code was non-zero
+                output = subprocess.check_output("./test_program")
+                output = output.decode(sys.stdout.encoding or "utf-8").splitlines()
+            else:
+                # Return an empty output if we are cross compiling
+                # as we cannot run the test_program
+                output = []
+        except Exception:
+            raise
+        finally:
+            os.chdir(start_dir)
+
+    return output
+
+
+def basic_check_build():
+    """Check basic compilation and linking of C code"""
+    if "PYODIDE" in os.environ:
+        # The following check won't work in pyodide
+        return
+
+    code = textwrap.dedent(
+        """\
+        #include <stdio.h>
+        int main(void) {
+        return 0;
+        }
+        """
+    )
+    compile_test_program(code)
@@ -0,0 +1,57 @@
+import argparse
+import os
+
+from Cython import Tempita as tempita
+
+# XXX: If this import ever fails (does it really?), vendor either
+# cython.tempita or numpy/npy_tempita.
+
+
+def process_tempita(fromfile, outfile=None):
+    """Process tempita templated file and write out the result.
+
+    The template file is expected to end in `.c.tp` or `.pyx.tp`:
+    E.g. processing `template.c.in` generates `template.c`.
+
+    """
+    with open(fromfile, "r", encoding="utf-8") as f:
+        template_content = f.read()
+
+    template = tempita.Template(template_content)
+    content = template.substitute()
+
+    with open(outfile, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("infile", type=str, help="Path to the input file")
+    parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
+    parser.add_argument(
+        "-i",
+        "--ignore",
+        type=str,
+        help=(
+            "An ignored input - may be useful to add a "
+            "dependency between custom targets"
+        ),
+    )
+    args = parser.parse_args()
+
+    if not args.infile.endswith(".tp"):
+        raise ValueError(f"Unexpected extension: {args.infile}")
+
+    if not args.outdir:
+        raise ValueError("Missing `--outdir` argument to tempita.py")
+
+    outdir_abs = os.path.join(os.getcwd(), args.outdir)
+    outfile = os.path.join(
+        outdir_abs, os.path.splitext(os.path.split(args.infile)[1])[0]
+    )
+
+    process_tempita(args.infile, outfile)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+"""Extract version number from __init__.py"""
+
+import os
+
+sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py")
+
+data = open(sklearn_init).readlines()
+version_line = next(line for line in data if line.startswith("__version__"))
+
+version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "")
+
+print(version)
@@ -0,0 +1,373 @@
+"""Global configuration state and functions for management"""
+
+import os
+import threading
+from contextlib import contextmanager as contextmanager
+
+# Default value of every configuration option. Some defaults are taken from
+# environment variables, which are read once when this module is imported.
+_global_config = {
+    # NOTE: bool() of any non-empty string is True, so every non-empty value
+    # of SKLEARN_ASSUME_FINITE (including "0" or "False") enables the option.
+    "assume_finite": bool(os.environ.get("SKLEARN_ASSUME_FINITE", False)),
+    "working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
+    "print_changed_only": True,
+    "display": "diagram",
+    "pairwise_dist_chunk_size": int(
+        os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256)
+    ),
+    "enable_cython_pairwise_dist": True,
+    "array_api_dispatch": False,
+    "transform_output": "default",
+    "enable_metadata_routing": False,
+    "skip_parameter_validation": False,
+}
+# Per-thread storage: each thread lazily receives its own copy of the
+# defaults above (see _get_threadlocal_config).
+_threadlocal = threading.local()
+
+
+def _get_threadlocal_config():
+    """Get a threadlocal **mutable** configuration. If the configuration
+    does not exist, copy the default global configuration."""
+    if not hasattr(_threadlocal, "global_config"):
+        _threadlocal.global_config = _global_config.copy()
+    return _threadlocal.global_config
+
+
+def get_config():
+    """Retrieve current values for configuration set by :func:`set_config`.
+
+    Returns
+    -------
+    config : dict
+        Keys are parameter names that can be passed to :func:`set_config`.
+
+    See Also
+    --------
+    config_context : Context manager for global scikit-learn configuration.
+    set_config : Set global scikit-learn configuration.
+
+    Examples
+    --------
+    >>> import sklearn
+    >>> config = sklearn.get_config()
+    >>> config.keys()
+    dict_keys([...])
+    """
+    # Return a copy of the threadlocal configuration so that users will
+    # not be able to modify the configuration with the returned dict.
+    return _get_threadlocal_config().copy()
+
+
+def set_config(
+    assume_finite=None,
+    working_memory=None,
+    print_changed_only=None,
+    display=None,
+    pairwise_dist_chunk_size=None,
+    enable_cython_pairwise_dist=None,
+    array_api_dispatch=None,
+    transform_output=None,
+    enable_metadata_routing=None,
+    skip_parameter_validation=None,
+):
+    """Set global scikit-learn configuration.
+
+    .. versionadded:: 0.19
+
+    Parameters
+    ----------
+    assume_finite : bool, default=None
+        If True, validation for finiteness will be skipped,
+        saving time, but leading to potential crashes. If
+        False, validation for finiteness will be performed,
+        avoiding error.  Global default: False.
+
+        .. versionadded:: 0.19
+
+    working_memory : int, default=None
+        If set, scikit-learn will attempt to limit the size of temporary arrays
+        to this number of MiB (per job when parallelised), often saving both
+        computation time and memory on expensive operations that can be
+        performed in chunks. Global default: 1024.
+
+        .. versionadded:: 0.20
+
+    print_changed_only : bool, default=None
+        If True, only the parameters that were set to non-default
+        values will be printed when printing an estimator. For example,
+        ``print(SVC())`` while True will only print 'SVC()' while the default
+        behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
+        all the non-changed parameters.
+
+        .. versionadded:: 0.21
+
+    display : {'text', 'diagram'}, default=None
+        If 'diagram', estimators will be displayed as a diagram in a Jupyter
+        lab or notebook context. If 'text', estimators will be displayed as
+        text. Default is 'diagram'.
+
+        .. versionadded:: 0.23
+
+    pairwise_dist_chunk_size : int, default=None
+        The number of row vectors per chunk for the accelerated pairwise-
+        distances reduction backend. Default is 256 (suitable for most of
+        modern laptops' caches and architectures).
+
+        Intended for easier benchmarking and testing of scikit-learn internals.
+        End users are not expected to benefit from customizing this configuration
+        setting.
+
+        .. versionadded:: 1.1
+
+    enable_cython_pairwise_dist : bool, default=None
+        Use the accelerated pairwise-distances reduction backend when
+        possible. Global default: True.
+
+        Intended for easier benchmarking and testing of scikit-learn internals.
+        End users are not expected to benefit from customizing this configuration
+        setting.
+
+        .. versionadded:: 1.1
+
+    array_api_dispatch : bool, default=None
+        Use Array API dispatching when inputs follow the Array API standard.
+        Default is False.
+
+        See the :ref:`User Guide <array_api>` for more details.
+
+        .. versionadded:: 1.2
+
+    transform_output : str, default=None
+        Configure output of `transform` and `fit_transform`.
+
+        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
+        for an example on how to use the API.
+
+        - `"default"`: Default output format of a transformer
+        - `"pandas"`: DataFrame output
+        - `"polars"`: Polars output
+        - `None`: Transform configuration is unchanged
+
+        .. versionadded:: 1.2
+        .. versionadded:: 1.4
+            `"polars"` option was added.
+
+    enable_metadata_routing : bool, default=None
+        Enable metadata routing. By default this feature is disabled.
+
+        Refer to :ref:`metadata routing user guide <metadata_routing>` for more
+        details.
+
+        - `True`: Metadata routing is enabled
+        - `False`: Metadata routing is disabled, use the old syntax.
+        - `None`: Configuration is unchanged
+
+        .. versionadded:: 1.3
+
+    skip_parameter_validation : bool, default=None
+        If `True`, disable the validation of the hyper-parameters' types and values in
+        the fit method of estimators and for arguments passed to public helper
+        functions. It can save time in some situations but can lead to low level
+        crashes and exceptions with confusing error messages.
+
+        Note that for data parameters, such as `X` and `y`, only type validation is
+        skipped but validation with `check_array` will continue to run.
+
+        .. versionadded:: 1.3
+
+    See Also
+    --------
+    config_context : Context manager for global scikit-learn configuration.
+    get_config : Retrieve current values of the global configuration.
+
+    Examples
+    --------
+    >>> from sklearn import set_config
+    >>> set_config(display='diagram')  # doctest: +SKIP
+    """
+    local_config = _get_threadlocal_config()
+
+    if assume_finite is not None:
+        local_config["assume_finite"] = assume_finite
+    if working_memory is not None:
+        local_config["working_memory"] = working_memory
+    if print_changed_only is not None:
+        local_config["print_changed_only"] = print_changed_only
+    if display is not None:
+        local_config["display"] = display
+    if pairwise_dist_chunk_size is not None:
+        local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size
+    if enable_cython_pairwise_dist is not None:
+        local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist
+    if array_api_dispatch is not None:
+        from .utils._array_api import _check_array_api_dispatch
+
+        _check_array_api_dispatch(array_api_dispatch)
+        local_config["array_api_dispatch"] = array_api_dispatch
+    if transform_output is not None:
+        local_config["transform_output"] = transform_output
+    if enable_metadata_routing is not None:
+        local_config["enable_metadata_routing"] = enable_metadata_routing
+    if skip_parameter_validation is not None:
+        local_config["skip_parameter_validation"] = skip_parameter_validation
+
+
+@contextmanager
+def config_context(
+    *,
+    assume_finite=None,
+    working_memory=None,
+    print_changed_only=None,
+    display=None,
+    pairwise_dist_chunk_size=None,
+    enable_cython_pairwise_dist=None,
+    array_api_dispatch=None,
+    transform_output=None,
+    enable_metadata_routing=None,
+    skip_parameter_validation=None,
+):
+    """Context manager for global scikit-learn configuration.
+
+    Parameters
+    ----------
+    assume_finite : bool, default=None
+        If True, validation for finiteness will be skipped,
+        saving time, but leading to potential crashes. If
+        False, validation for finiteness will be performed,
+        avoiding error. If None, the existing value won't change.
+        The default value is False.
+
+    working_memory : int, default=None
+        If set, scikit-learn will attempt to limit the size of temporary arrays
+        to this number of MiB (per job when parallelised), often saving both
+        computation time and memory on expensive operations that can be
+        performed in chunks. If None, the existing value won't change.
+        The default value is 1024.
+
+    print_changed_only : bool, default=None
+        If True, only the parameters that were set to non-default
+        values will be printed when printing an estimator. For example,
+        ``print(SVC())`` while True will only print 'SVC()', but would print
+        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
+        when False. If None, the existing value won't change.
+        The default value is True.
+
+        .. versionchanged:: 0.23
+           Default changed from False to True.
+
+    display : {'text', 'diagram'}, default=None
+        If 'diagram', estimators will be displayed as a diagram in a Jupyter
+        lab or notebook context. If 'text', estimators will be displayed as
+        text. If None, the existing value won't change.
+        The default value is 'diagram'.
+
+        .. versionadded:: 0.23
+
+    pairwise_dist_chunk_size : int, default=None
+        The number of row vectors per chunk for the accelerated pairwise-
+        distances reduction backend. Default is 256 (suitable for most of
+        modern laptops' caches and architectures).
+
+        Intended for easier benchmarking and testing of scikit-learn internals.
+        End users are not expected to benefit from customizing this configuration
+        setting.
+
+        .. versionadded:: 1.1
+
+    enable_cython_pairwise_dist : bool, default=None
+        Use the accelerated pairwise-distances reduction backend when
+        possible. Global default: True.
+
+        Intended for easier benchmarking and testing of scikit-learn internals.
+        End users are not expected to benefit from customizing this configuration
+        setting.
+
+        .. versionadded:: 1.1
+
+    array_api_dispatch : bool, default=None
+        Use Array API dispatching when inputs follow the Array API standard.
+        Default is False.
+
+        See the :ref:`User Guide <array_api>` for more details.
+
+        .. versionadded:: 1.2
+
+    transform_output : str, default=None
+        Configure output of `transform` and `fit_transform`.
+
+        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
+        for an example on how to use the API.
+
+        - `"default"`: Default output format of a transformer
+        - `"pandas"`: DataFrame output
+        - `"polars"`: Polars output
+        - `None`: Transform configuration is unchanged
+
+        .. versionadded:: 1.2
+        .. versionadded:: 1.4
+            `"polars"` option was added.
+
+    enable_metadata_routing : bool, default=None
+        Enable metadata routing. By default this feature is disabled.
+
+        Refer to :ref:`metadata routing user guide <metadata_routing>` for more
+        details.
+
+        - `True`: Metadata routing is enabled
+        - `False`: Metadata routing is disabled, use the old syntax.
+        - `None`: Configuration is unchanged
+
+        .. versionadded:: 1.3
+
+    skip_parameter_validation : bool, default=None
+        If `True`, disable the validation of the hyper-parameters' types and values in
+        the fit method of estimators and for arguments passed to public helper
+        functions. It can save time in some situations but can lead to low level
+        crashes and exceptions with confusing error messages.
+
+        Note that for data parameters, such as `X` and `y`, only type validation is
+        skipped but validation with `check_array` will continue to run.
+
+        .. versionadded:: 1.3
+
+    Yields
+    ------
+    None.
+
+    See Also
+    --------
+    set_config : Set global scikit-learn configuration.
+    get_config : Retrieve current values of the global configuration.
+
+    Notes
+    -----
+    All settings, not just those presently modified, will be returned to
+    their previous values when the context manager is exited.
+
+    Examples
+    --------
+    >>> import sklearn
+    >>> from sklearn.utils.validation import assert_all_finite
+    >>> with sklearn.config_context(assume_finite=True):
+    ...     assert_all_finite([float('nan')])
+    >>> with sklearn.config_context(assume_finite=True):
+    ...     with sklearn.config_context(assume_finite=False):
+    ...         assert_all_finite([float('nan')])
+    Traceback (most recent call last):
+    ...
+    ValueError: Input contains NaN...
+    """
+    old_config = get_config()
+    set_config(
+        assume_finite=assume_finite,
+        working_memory=working_memory,
+        print_changed_only=print_changed_only,
+        display=display,
+        pairwise_dist_chunk_size=pairwise_dist_chunk_size,
+        enable_cython_pairwise_dist=enable_cython_pairwise_dist,
+        array_api_dispatch=array_api_dispatch,
+        transform_output=transform_output,
+        enable_metadata_routing=enable_metadata_routing,
+        skip_parameter_validation=skip_parameter_validation,
+    )
+
+    try:
+        yield
+    finally:
+        set_config(**old_config)
@@ -0,0 +1,10 @@
+"""Distributor init file
+
+Distributors: you can add custom code here to support particular distributions
+of scikit-learn.
+
+For example, this is a good place to put any checks for hardware requirements.
+
+The scikit-learn standard source distribution will not put code in this file,
+so you can safely replace this file with your own version.
+"""
@@ -0,0 +1,115 @@
+# Author: Nelle Varoquaux, Andrew Tulloch, Antony Lee
+
+# Uses the pool adjacent violators algorithm (PAVA), with the
+# enhancement of searching for the longest decreasing subsequence to
+# pool at each step.
+
+import numpy as np
+from cython cimport floating
+
+
+def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
+    """Run the pool adjacent violators algorithm on `y`, in place.
+
+    Adjacent entries that are not strictly increasing are pooled into blocks.
+    The first entry of each block receives the weighted mean of the block in
+    `y` and the summed weight in `w`; a final pass copies each block value
+    over the whole block in `y`.
+
+    Parameters
+    ----------
+    y : contiguous 1d memoryview of a floating type
+        Values to fit. Overwritten with the result.
+    w : contiguous 1d memoryview of the same floating type
+        Weights. Entries at block starts are overwritten with pooled weights.
+    """
+    cdef:
+        Py_ssize_t n = y.shape[0], i, k
+        floating prev_y, sum_wy, sum_w
+        Py_ssize_t[::1] target = np.arange(n, dtype=np.intp)
+
+    # target describes a list of blocks.  At any time, if [i..j] (inclusive) is
+    # an active block, then target[i] := j and target[j] := i.
+    # Initially target[i] == i, i.e. every index is its own singleton block.
+
+    # For "active" indices (block starts):
+    # w[i] := sum{w_orig[j], j=[i..target[i]]}
+    # y[i] := sum{y_orig[j]*w_orig[j], j=[i..target[i]]} / w[i]
+
+    with nogil:
+        i = 0
+        while i < n:
+            # k is the start of the block that follows the block starting at i.
+            k = target[i] + 1
+            if k == n:
+                # The current block reaches the end: nothing left to compare.
+                break
+            if y[i] < y[k]:
+                # Strictly increasing: keep both blocks and move on.
+                i = k
+                continue
+            # Order is violated (or tied): start pooling from block i.
+            sum_wy = w[i] * y[i]
+            sum_w = w[i]
+            while True:
+                # We are within a decreasing subsequence.
+                # (Ties are pooled too: only a strict increase ends the run.)
+                prev_y = y[k]
+                sum_wy += w[k] * y[k]
+                sum_w += w[k]
+                k = target[k] + 1
+                if k == n or prev_y < y[k]:
+                    # Non-singleton decreasing subsequence is finished,
+                    # update first entry.
+                    y[i] = sum_wy / sum_w
+                    w[i] = sum_w
+                    # Record the merged block [i..k-1] at both of its ends.
+                    target[i] = k - 1
+                    target[k - 1] = i
+                    if i > 0:
+                        # Backtrack if we can.  This makes the algorithm
+                        # single-pass and ensures O(n) complexity.
+                        # i - 1 is the last index of the previous block, so
+                        # target[i - 1] is the start of that block.
+                        i = target[i - 1]
+                    # Otherwise, restart from the same point.
+                    break
+        # Reconstruct the solution.
+        i = 0
+        while i < n:
+            k = target[i] + 1
+            # Copy the block value stored at the block start over the rest of
+            # the block [i+1..k-1].
+            y[i + 1 : k] = y[i]
+            i = k
+
+
+def _make_unique(const floating[::1] X,
+                 const floating[::1] y,
+                 const floating[::1] sample_weights):
+    """Average targets for duplicate X, drop duplicates.
+
+    Aggregates duplicate X values into a single X value where
+    the target y is a (sample_weighted) average of the individual
+    targets.
+
+    Assumes that X is ordered, so that all duplicates follow each other.
+
+    Parameters
+    ----------
+    X : contiguous 1d memoryview of a floating type
+        Input values. `X[0]` is read unconditionally, so X must not be empty.
+    y : contiguous 1d memoryview of the same floating type
+        Targets, indexed like X.
+    sample_weights : contiguous 1d memoryview of the same floating type
+        Weights, indexed like X.
+
+    Returns
+    -------
+    x_out, y_out, weights_out : ndarray
+        One entry per group of duplicates: the first X value of the group,
+        the weighted mean of its targets and the sum of its weights.
+    """
+    # Number of distinct values in X; used to size the output buffers. The
+    # loop below can only produce at most this many groups when X is sorted.
+    unique_values = len(np.unique(X))
+
+    # Pick the NumPy dtype matching the fused type this specialization uses.
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    cdef floating[::1] y_out = np.empty(unique_values, dtype=dtype)
+    cdef floating[::1] x_out = np.empty_like(y_out)
+    cdef floating[::1] weights_out = np.empty_like(y_out)
+
+    # Accumulators for the group currently being built; `i` is the index of
+    # the next output slot.
+    cdef floating current_x = X[0]
+    cdef floating current_y = 0
+    cdef floating current_weight = 0
+    cdef int i = 0
+    cdef int j
+    cdef floating x
+    cdef int n_samples = len(X)
+    # Values closer than this to the first value of the current group are
+    # treated as duplicates of it.
+    cdef floating eps = np.finfo(dtype).resolution
+
+    for j in range(n_samples):
+        x = X[j]
+        if x - current_x >= eps:
+            # next unique value
+            # Flush the finished group, then start a new one from sample j.
+            x_out[i] = current_x
+            weights_out[i] = current_weight
+            y_out[i] = current_y / current_weight
+            i += 1
+            current_x = x
+            current_weight = sample_weights[j]
+            current_y = y[j] * sample_weights[j]
+        else:
+            # Same group: accumulate the weight and the weighted target.
+            current_weight += sample_weights[j]
+            current_y += y[j] * sample_weights[j]
+
+    # Flush the last group, which the loop never writes out.
+    x_out[i] = current_x
+    weights_out[i] = current_weight
+    y_out[i] = current_y / current_weight
+    # Only the first i + 1 slots were filled; trim the buffers accordingly.
+    return(
+        np.asarray(x_out[:i+1]),
+        np.asarray(y_out[:i+1]),
+        np.asarray(weights_out[:i+1]),
+    )
@@ -0,0 +1,30 @@
+"""
+The :mod:`sklearn._loss` module includes loss function classes suitable for
+fitting classification and regression tasks.
+"""
+
+from .loss import (
+    AbsoluteError,
+    HalfBinomialLoss,
+    HalfGammaLoss,
+    HalfMultinomialLoss,
+    HalfPoissonLoss,
+    HalfSquaredError,
+    HalfTweedieLoss,
+    HalfTweedieLossIdentity,
+    HuberLoss,
+    PinballLoss,
+)
+
+# Public names of this package: exactly the loss classes imported from
+# `.loss` above.
+__all__ = [
+    "HalfSquaredError",
+    "AbsoluteError",
+    "PinballLoss",
+    "HuberLoss",
+    "HalfPoissonLoss",
+    "HalfGammaLoss",
+    "HalfTweedieLoss",
+    "HalfTweedieLossIdentity",
+    "HalfBinomialLoss",
+    "HalfMultinomialLoss",
+]
@@ -0,0 +1,91 @@
+# Fused types for input like y_true, raw_prediction, sample_weights.
+ctypedef fused floating_in:
+    double
+    float
+
+
+# Fused types for output like gradient and hessian
+# We use different fused types for input (floating_in) and output (floating_out), such
+# that input and output can have different dtypes in the same function call. A single
+# fused type can only take on one single value (type) for all arguments in one function
+# call.
+ctypedef fused floating_out:
+    double
+    float
+
+
+# Struct to return 2 doubles
+ctypedef struct double_pair:
+    double val1
+    double val2
+
+
+# C base class for loss functions.
+# It declares three C-level methods that each take one scalar `y_true` and one
+# scalar `raw_prediction` (both double):
+# - cy_loss returns a double,
+# - cy_gradient returns a double,
+# - cy_grad_hess returns a double_pair (presumably val1 = gradient and
+#   val2 = hessian; confirm against the implementation file).
+# All three are declared `noexcept nogil`, so they can be called without
+# holding the GIL and do not propagate Python exceptions.
+# Every subclass below re-declares the same three methods with identical
+# signatures.
+cdef class CyLossFunction:
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHalfSquaredError(CyLossFunction):
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyAbsoluteError(CyLossFunction):
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyPinballLoss(CyLossFunction):
+    # readonly: the attribute can be read, but not assigned, from Python.
+    cdef readonly double quantile  # readonly makes it accessible from Python
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHuberLoss(CyLossFunction):
+    # public (unlike the readonly attributes of the other classes): the
+    # attribute can be both read and assigned from Python.
+    cdef public double delta  # public makes it accessible from Python
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHalfPoissonLoss(CyLossFunction):
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHalfGammaLoss(CyLossFunction):
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHalfTweedieLoss(CyLossFunction):
+    cdef readonly double power  # readonly makes it accessible from Python
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHalfTweedieLossIdentity(CyLossFunction):
+    cdef readonly double power  # readonly makes it accessible from Python
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyHalfBinomialLoss(CyLossFunction):
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
+
+
+cdef class CyExponentialLoss(CyLossFunction):
+    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
+    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
@@ -0,0 +1,281 @@
+"""
+Module contains classes for invertible (and differentiable) link functions.
+"""
+
+# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import numpy as np
+from scipy.special import expit, logit
+from scipy.stats import gmean
+
+from ..utils.extmath import softmax
+
+
+@dataclass
+class Interval:
+    """Range of real numbers with individually open or closed bounds.
+
+    `low` and `high` are the bounds; `low_inclusive` and `high_inclusive`
+    tell whether the respective bound itself belongs to the range.
+    """
+
+    low: float
+    high: float
+    low_inclusive: bool
+    high_inclusive: bool
+
+    def __post_init__(self):
+        """Reject bounds that are in the wrong order."""
+        if self.low > self.high:
+            raise ValueError(
+                f"One must have low <= high; got low={self.low}, high={self.high}."
+            )
+
+    def includes(self, x):
+        """Tell whether every value of x lies within the interval.
+
+        Parameters
+        ----------
+        x : ndarray
+            Values to check against both bounds.
+
+        Returns
+        -------
+        result : bool
+            True only if all elements of x satisfy both bounds.
+        """
+        # Choose the comparison matching the openness of the lower bound.
+        above_low = np.greater_equal if self.low_inclusive else np.greater
+        if not np.all(above_low(x, self.low)):
+            return False
+
+        below_high = np.less_equal if self.high_inclusive else np.less
+        # np.all yields numpy.bool_; convert to a plain Python bool.
+        return bool(np.all(below_high(x, self.high)))
+
+
+def _inclusive_low_high(interval, dtype=np.float64):
+    """Return two values that lie strictly inside the given interval.
+
+    This is used in tests only.
+
+    Infinite bounds are replaced by -1e10 / 1e10; finite bounds are moved
+    inwards by a small amount derived from the machine epsilon of `dtype`.
+
+    Returns
+    -------
+    low, high : tuple
+        The returned values low and high lie within the interval.
+    """
+    eps = 10 * np.finfo(dtype).eps
+    lower, upper = interval.low, interval.high
+
+    if lower == -np.inf:
+        low = -1e10
+    else:
+        # Scale towards the interior: shrink negative bounds, grow others.
+        factor = (1 - eps) if lower < 0 else (1 + eps)
+        low = lower * factor + eps
+
+    if upper == np.inf:
+        high = 1e10
+    else:
+        factor = (1 + eps) if upper < 0 else (1 - eps)
+        high = upper * factor - eps
+
+    return low, high
+
+
+class BaseLink(ABC):
+    """Abstract base class for differentiable, invertible link functions.
+
+    Convention:
+        - link function g: raw_prediction = g(y_pred)
+        - inverse link h: y_pred = h(raw_prediction)
+
+    For (generalized) linear models, `raw_prediction = X @ coef` is the so
+    called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
+    conditional (on X) expected value of the target `y_true`.
+
+    The methods are not implemented as staticmethods in case a link function needs
+    parameters.
+
+    Subclasses must implement `link` and `inverse` and may override the class
+    attributes `is_multiclass` and `interval_y_pred`.
+    """
+
+    is_multiclass = False  # used for testing only
+
+    # Usually, raw_prediction may be any real number and y_pred lies in an
+    # open interval. The default below is the whole real line.
+    # interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
+    interval_y_pred = Interval(-np.inf, np.inf, False, False)
+
+    @abstractmethod
+    def link(self, y_pred, out=None):
+        """Compute the link function g(y_pred).
+
+        The link function maps (predicted) target values to raw predictions,
+        i.e. `g(y_pred) = raw_prediction`.
+
+        Parameters
+        ----------
+        y_pred : array
+            Predicted target values.
+        out : array, default=None
+            A location into which the result is stored. If provided, it must
+            have a shape that the inputs broadcast to. If not provided or None,
+            a freshly-allocated array is returned.
+
+        Returns
+        -------
+        out : array
+            Output array, element-wise link function.
+        """
+
+    @abstractmethod
+    def inverse(self, raw_prediction, out=None):
+        """Compute the inverse link function h(raw_prediction).
+
+        The inverse link function maps raw predictions to predicted target
+        values, i.e. `h(raw_prediction) = y_pred`.
+
+        Parameters
+        ----------
+        raw_prediction : array
+            Raw prediction values (in link space).
+        out : array, default=None
+            A location into which the result is stored. If provided, it must
+            have a shape that the inputs broadcast to. If not provided or None,
+            a freshly-allocated array is returned.
+
+        Returns
+        -------
+        out : array
+            Output array, element-wise inverse link function.
+        """
+
+
+class IdentityLink(BaseLink):
+    """The identity link function g(x)=x."""
+
+    def link(self, y_pred, out=None):
+        """Return `y_pred` unchanged, or copy it into `out` when given.
+
+        Without `out`, the input object itself is returned (no copy is made).
+        """
+        if out is None:
+            return y_pred
+        np.copyto(out, y_pred)
+        return out
+
+    # The identity is its own inverse, so both directions share one function.
+    inverse = link
+
+
+class LogLink(BaseLink):
+    """The log link function g(x)=log(x)."""
+
+    # y_pred is restricted to the open interval (0, inf).
+    interval_y_pred = Interval(0, np.inf, False, False)
+
+    def link(self, y_pred, out=None):
+        """Return log(y_pred) element-wise, stored in `out` when it is given."""
+        return np.log(y_pred, out=out)
+
+    def inverse(self, raw_prediction, out=None):
+        """Return exp(raw_prediction) element-wise, stored in `out` when given."""
+        return np.exp(raw_prediction, out=out)
+
+
+class LogitLink(BaseLink):
+    """The logit link function g(x)=logit(x)."""
+
+    # y_pred is restricted to the open interval (0, 1).
+    interval_y_pred = Interval(0, 1, False, False)
+
+    def link(self, y_pred, out=None):
+        """Return logit(y_pred) element-wise, stored in `out` when it is given."""
+        return logit(y_pred, out=out)
+
+    def inverse(self, raw_prediction, out=None):
+        """Return expit(raw_prediction) element-wise, stored in `out` when given."""
+        return expit(raw_prediction, out=out)
+
+
+class HalfLogitLink(BaseLink):
+    """Half the logit link function g(x)=1/2 * logit(x).
+
+    Used for the exponential loss.
+    """
+
+    # y_pred is restricted to the open interval (0, 1).
+    interval_y_pred = Interval(0, 1, False, False)
+
+    def link(self, y_pred, out=None):
+        """Return 0.5 * logit(y_pred), stored in `out` when it is given."""
+        out = logit(y_pred, out=out)
+        # Halve in place so that a caller-provided `out` holds the final result.
+        out *= 0.5
+        return out
+
+    def inverse(self, raw_prediction, out=None):
+        """Return expit(2 * raw_prediction), stored in `out` when it is given."""
+        # Pass `out` by keyword, as every other link class in this module does.
+        return expit(2 * raw_prediction, out=out)
+
+
+class MultinomialLogit(BaseLink):
+    """The symmetric multinomial logit function.
+
+    Convention:
+        - y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
+
+    Notes:
+        - The inverse link h is the softmax function.
+        - The sum is over the second axis, i.e. axis=1 (n_classes).
+
+    We have to choose additional constraints in order to make
+
+        y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)
+
+    for n_classes classes identifiable and invertible.
+    We choose the symmetric side constraint where the geometric mean response
+    is set as reference category, see [2]:
+
+    The symmetric multinomial logit link function for a single data point is
+    then defined as
+
+        raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
+        = log(y_pred[k]) - mean(log(y_pred)).
+
+    Note that this is equivalent to the definition in [1] and implies mean
+    centered raw predictions:
+
+        sum(raw_prediction[k], k=0..n_classes-1) = 0.
+
+    For linear models with raw_prediction = X @ coef, this corresponds to
+    sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
+    feature is zero.
+
+    References
+    ----------
+    .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
+        logistic regression: a statistical view of boosting" Ann. Statist.
+        28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
+        https://projecteuclid.org/euclid.aos/1016218223
+
+    .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
+        multinomial logit models with symmetric side constraints."
+        Computational Statistics 28 (2013): 1017-1034.
+        http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
+    """
+
+    is_multiclass = True
+    interval_y_pred = Interval(0, 1, False, False)
+
+    def symmetrize_raw_prediction(self, raw_prediction):
+        """Subtract the per-sample mean over classes from `raw_prediction`."""
+        class_mean = np.mean(raw_prediction, axis=1)
+        return raw_prediction - class_mean[:, np.newaxis]
+
+    def link(self, y_pred, out=None):
+        """Return log(y_pred / gmean(y_pred)) with gmean taken over axis 1."""
+        # The geometric mean over classes acts as the reference category.
+        reference = gmean(y_pred, axis=1)[:, np.newaxis]
+        return np.log(y_pred / reference, out=out)
+
+    def inverse(self, raw_prediction, out=None):
+        """Return softmax(raw_prediction), stored in `out` when it is given."""
+        if out is not None:
+            # Copy first, then let softmax overwrite the copy in place.
+            np.copyto(out, raw_prediction)
+            softmax(out, copy=False)
+            return out
+        return softmax(raw_prediction, copy=True)
+
+
+# Registry mapping the name of a link function to its class. The values are
+# classes, not instances: call the looked-up class to create a link object.
+_LINKS = {
+    "identity": IdentityLink,
+    "log": LogLink,
+    "logit": LogitLink,
+    "half_logit": HalfLogitLink,
+    "multinomial_logit": MultinomialLogit,
+}
@@ -0,0 +1,19 @@
+# .pyx is generated, so this is needed to make Cython compilation work
+# (the .pxd is copied so that it is available alongside the generated .pyx).
+_loss_cython_tree = [
+  fs.copyfile('_loss.pxd')
+]
+
+# Generate `_loss.pyx` from the template `_loss.pyx.tp` by running the
+# `tempita` script with the `py` interpreter (both are defined outside of
+# this file).
+_loss_pyx = custom_target(
+  '_loss_pyx',
+  output: '_loss.pyx',
+  input: '_loss.pyx.tp',
+  command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'],
+)
+
+# Build the `_loss` extension module from the generated .pyx and install it
+# under sklearn/_loss.
+py.extension_module(
+  '_loss',
+  [_loss_pyx, _loss_cython_tree],
+  cython_args: cython_args,
+  install: true,
+  subdir: 'sklearn/_loss',
+)
@@ -0,0 +1,111 @@
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal
+
+from sklearn._loss.link import (
+    _LINKS,
+    HalfLogitLink,
+    Interval,
+    MultinomialLogit,
+    _inclusive_low_high,
+)
+
+LINK_FUNCTIONS = list(_LINKS.values())
+
+
+def test_interval_raises():
+    """Check that constructing an Interval with low > high raises ValueError."""
+    expected_message = "One must have low <= high; got low=1, high=0."
+    with pytest.raises(ValueError, match=expected_message):
+        Interval(1, 0, False, False)
+
+
+@pytest.mark.parametrize(
+    "interval",
+    [
+        Interval(0, 1, False, False),
+        Interval(0, 1, False, True),
+        Interval(0, 1, True, False),
+        Interval(0, 1, True, True),
+        Interval(-np.inf, np.inf, False, False),
+        Interval(-np.inf, np.inf, False, True),
+        Interval(-np.inf, np.inf, True, False),
+        Interval(-np.inf, np.inf, True, True),
+        Interval(-10, -1, False, False),
+        Interval(-10, -1, False, True),
+        Interval(-10, -1, True, False),
+        Interval(-10, -1, True, True),
+    ],
+)
+def test_is_in_range(interval):
+    """Check `Interval.includes` for interior points and for the bounds."""
+    # Interior end points, so that the whole linspace lies inside the interval.
+    low, high = _inclusive_low_high(interval)
+    interior = np.linspace(low, high, num=10)
+    assert interval.includes(interior)
+
+    # Adding the lower bound is accepted only if that bound is inclusive.
+    with_low = np.append(interior, interval.low)
+    assert interval.includes(with_low) == interval.low_inclusive
+
+    # Same for the upper bound.
+    with_high = np.append(interior, interval.high)
+    assert interval.includes(with_high) == interval.high_inclusive
+
+    # With both bounds added, both must be inclusive.
+    with_both = np.append(interior, [interval.low, interval.high])
+    both_inclusive = interval.low_inclusive and interval.high_inclusive
+    assert interval.includes(with_both) == both_inclusive
+
+
+@pytest.mark.parametrize("link", LINK_FUNCTIONS)
+def test_link_inverse_identity(link, global_random_seed):
+    """Check that link and inverse undo each other, in both orders."""
+    rng = np.random.RandomState(global_random_seed)
+    link = link()
+    n_samples = 100
+    # The values for `raw_prediction` are limited from -20 to 20 because in the
+    # class `LogitLink` the term `expit(x)` comes very close to 1 for large
+    # positive x and therefore loses precision. `HalfLogitLink` gets half that
+    # range.
+    if link.is_multiclass:
+        size, bound = (n_samples, 10), 20
+    elif isinstance(link, HalfLogitLink):
+        size, bound = n_samples, 10
+    else:
+        size, bound = n_samples, 20
+    raw_prediction = rng.uniform(low=-bound, high=bound, size=size)
+    if link.is_multiclass and isinstance(link, MultinomialLogit):
+        raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
+
+    # link(inverse(raw)) == raw
+    y_pred = link.inverse(raw_prediction)
+    assert_allclose(link.link(y_pred), raw_prediction)
+    # inverse(link(y)) == y
+    assert_allclose(link.inverse(link.link(y_pred)), y_pred)
+
+
+@pytest.mark.parametrize("link", LINK_FUNCTIONS)
+def test_link_out_argument(link):
+    """Check that `out` receives the result and is the returned array."""
+
+    def check_out(func, arg, expected):
+        # The buffer passed as `out` must hold the expected values and share
+        # memory with whatever `func` returns.
+        buffer = np.empty_like(arg)
+        returned = func(arg, out=buffer)
+        assert_allclose(expected, buffer)
+        assert_array_equal(buffer, returned)
+        assert np.shares_memory(buffer, returned)
+
+    rng = np.random.RandomState(42)
+    link = link()
+    n_samples = 100
+    if not link.is_multiclass:
+        # So far, the valid interval of raw_prediction is (-inf, inf) and
+        # we do not need to distinguish.
+        raw_prediction = rng.uniform(low=-10, high=10, size=n_samples)
+    else:
+        raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, 10))
+        if isinstance(link, MultinomialLogit):
+            raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
+
+    y_pred = link.inverse(raw_prediction, out=None)
+    check_out(link.inverse, raw_prediction, y_pred)
+    check_out(link.link, y_pred, raw_prediction)
@@ -0,0 +1,67 @@
+"""All minimum dependencies for scikit-learn."""
+
+import argparse
+from collections import defaultdict
+
+# scipy and cython should be in sync with pyproject.toml
+NUMPY_MIN_VERSION = "1.19.5"
+SCIPY_MIN_VERSION = "1.6.0"
+JOBLIB_MIN_VERSION = "1.2.0"
+THREADPOOLCTL_MIN_VERSION = "3.1.0"
+PYTEST_MIN_VERSION = "7.1.2"
+CYTHON_MIN_VERSION = "3.0.10"
+
+
+# 'build' and 'install' are included to have structured metadata for CI.
+# It will NOT be included in setup's extras_require
+# The values are (version_spec, comma separated tags)
+dependent_packages = {
+    "numpy": (NUMPY_MIN_VERSION, "build, install"),
+    "scipy": (SCIPY_MIN_VERSION, "build, install"),
+    "joblib": (JOBLIB_MIN_VERSION, "install"),
+    "threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"),
+    "cython": (CYTHON_MIN_VERSION, "build"),
+    "meson-python": ("0.15.0", "build"),
+    "matplotlib": ("3.3.4", "benchmark, docs, examples, tests"),
+    "scikit-image": ("0.17.2", "docs, examples, tests"),
+    "pandas": ("1.1.5", "benchmark, docs, examples, tests"),
+    "seaborn": ("0.9.0", "docs, examples"),
+    "memory_profiler": ("0.57.0", "benchmark, docs"),
+    "pytest": (PYTEST_MIN_VERSION, "tests"),
+    "pytest-cov": ("2.9.0", "tests"),
+    "ruff": ("0.2.1", "tests"),
+    "black": ("24.3.0", "tests"),
+    "mypy": ("1.9", "tests"),
+    "pyamg": ("4.0.0", "tests"),
+    "polars": ("0.20.23", "docs, tests"),
+    "pyarrow": ("12.0.0", "tests"),
+    "sphinx": ("6.0.0", "docs"),
+    "sphinx-copybutton": ("0.5.2", "docs"),
+    "sphinx-gallery": ("0.15.0", "docs"),
+    "numpydoc": ("1.2.0", "docs, tests"),
+    "Pillow": ("7.1.2", "docs"),
+    "pooch": ("1.6.0", "docs, examples, tests"),
+    "sphinx-prompt": ("1.3.0", "docs"),
+    "sphinxext-opengraph": ("0.4.2", "docs"),
+    "plotly": ("5.14.0", "docs, examples"),
+    # XXX: Pin conda-lock to the latest released version (needs manual update
+    # from time to time)
+    "conda-lock": ("2.5.6", "maintenance"),
+}
+
+
+# create inverse mapping for setuptools
+# Each tag maps to the list of "package>=min_version" requirement strings of
+# the packages carrying that tag.
+tag_to_packages: dict = defaultdict(list)
+for package, (min_version, extras) in dependent_packages.items():
+    requirement = f"{package}>={min_version}"
+    for extra in extras.split(", "):
+        tag_to_packages[extra].append(requirement)
+
+
+# Used by CI to get the min dependencies
+if __name__ == "__main__":
+    # Command line: print the minimum version of the package given as the
+    # single positional argument (restricted to the known package names).
+    parser = argparse.ArgumentParser(description="Get min dependencies for a package")
+    parser.add_argument("package", choices=dependent_packages)
+
+    requested_package = parser.parse_args().package
+    print(dependent_packages[requested_package][0])
@@ -0,0 +1,56 @@
+"""
+The :mod:`sklearn.cluster` module gathers popular unsupervised clustering
+algorithms.
+"""
+
+from ._affinity_propagation import AffinityPropagation, affinity_propagation
+from ._agglomerative import (
+    AgglomerativeClustering,
+    FeatureAgglomeration,
+    linkage_tree,
+    ward_tree,
+)
+from ._bicluster import SpectralBiclustering, SpectralCoclustering
+from ._birch import Birch
+from ._bisect_k_means import BisectingKMeans
+from ._dbscan import DBSCAN, dbscan
+from ._hdbscan.hdbscan import HDBSCAN
+from ._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
+from ._mean_shift import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
+from ._optics import (
+    OPTICS,
+    cluster_optics_dbscan,
+    cluster_optics_xi,
+    compute_optics_graph,
+)
+from ._spectral import SpectralClustering, spectral_clustering
+
+# Public names of this package: the estimator classes and helper functions
+# imported from the private submodules above.
+__all__ = [
+    "AffinityPropagation",
+    "AgglomerativeClustering",
+    "Birch",
+    "DBSCAN",
+    "OPTICS",
+    "cluster_optics_dbscan",
+    "cluster_optics_xi",
+    "compute_optics_graph",
+    "KMeans",
+    "BisectingKMeans",
+    "FeatureAgglomeration",
+    "MeanShift",
+    "MiniBatchKMeans",
+    "SpectralClustering",
+    "affinity_propagation",
+    "dbscan",
+    "estimate_bandwidth",
+    "get_bin_seeds",
+    "k_means",
+    "kmeans_plusplus",
+    "linkage_tree",
+    "mean_shift",
+    "spectral_clustering",
+    "ward_tree",
+    "SpectralBiclustering",
+    "SpectralCoclustering",
+    "HDBSCAN",
+]
@@ -0,0 +1,604 @@
+"""Affinity Propagation clustering algorithm."""
+
+# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
+#        Gael Varoquaux gael.varoquaux@normalesup.org
+
+# License: BSD 3 clause
+
+import warnings
+from numbers import Integral, Real
+
+import numpy as np
+
+from .._config import config_context
+from ..base import BaseEstimator, ClusterMixin, _fit_context
+from ..exceptions import ConvergenceWarning
+from ..metrics import euclidean_distances, pairwise_distances_argmin
+from ..utils import check_random_state
+from ..utils._param_validation import Interval, StrOptions, validate_params
+from ..utils.validation import check_is_fitted
+
+
+def _equal_similarities_and_preferences(S, preference):
+    """Check whether all preferences and all off-diagonal similarities are equal.
+
+    Parameters
+    ----------
+    S : ndarray
+        Similarity matrix; its diagonal entries are ignored.
+
+    preference : ndarray
+        Preference value(s); each one is compared against the first.
+
+    Returns
+    -------
+    bool
+        True only if every preference equals the first preference and every
+        off-diagonal entry of `S` equals the first off-diagonal entry.
+    """
+    preferences_equal = np.all(preference == preference.flat[0])
+    if not preferences_equal:
+        # Same short-circuit as `and`: the similarities are never inspected.
+        return preferences_equal
+
+    # Boolean selector that keeps everything except the diagonal of S.
+    off_diagonal = np.ones(S.shape, dtype=bool)
+    np.fill_diagonal(off_diagonal, 0)
+    similarities = S[off_diagonal]
+
+    return np.all(similarities.flat == similarities.flat[0])
+
+
+def _affinity_propagation(
+    S,
+    *,
+    preference,
+    convergence_iter,
+    max_iter,
+    damping,
+    verbose,
+    return_n_iter,
+    random_state,
+):
+    """Main affinity propagation algorithm.
+
+    Parameters
+    ----------
+    S : ndarray of shape (n_samples, n_samples)
+        Similarity matrix. It is modified in place: the preferences are
+        written on its diagonal and a small random perturbation is added.
+
+    preference : ndarray
+        Preference value(s) placed on the diagonal of `S`.
+
+    convergence_iter : int
+        Number of most recent iterations over which the exemplar decisions
+        must stay unchanged for the run to be considered converged.
+
+    max_iter : int
+        Maximum number of message-passing iterations.
+
+    damping : float
+        Weight kept from the previous messages; the newly computed messages
+        are weighted by ``1 - damping``.
+
+    verbose : bool
+        Whether to print the convergence status.
+
+    return_n_iter : bool
+        Whether to also return the number of iterations.
+
+    random_state : RandomState instance
+        Generator used to draw the perturbation that removes degeneracies.
+
+    Returns
+    -------
+    cluster_centers_indices : ndarray of shape (n_clusters,)
+        Indices of the exemplars; empty when no exemplar was found.
+
+    labels : ndarray of shape (n_samples,)
+        Cluster label of each sample; all ``-1`` when no exemplar was found.
+
+    n_iter : int
+        Number of iterations run (``0`` for the degenerate early return).
+        Returned only if `return_n_iter` is True.
+    """
+    n_samples = S.shape[0]
+    if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
+        # It makes no sense to run the algorithm in this case, so return 1 or
+        # n_samples clusters, depending on preferences
+        warnings.warn(
+            "All samples have mutually equal similarities. "
+            "Returning arbitrary cluster center(s)."
+        )
+        if preference.flat[0] > S.flat[n_samples - 1]:
+            return (
+                (np.arange(n_samples), np.arange(n_samples), 0)
+                if return_n_iter
+                else (np.arange(n_samples), np.arange(n_samples))
+            )
+        else:
+            return (
+                (np.array([0]), np.array([0] * n_samples), 0)
+                if return_n_iter
+                else (np.array([0]), np.array([0] * n_samples))
+            )
+
+    # Place preference on the diagonal of S
+    S.flat[:: (n_samples + 1)] = preference
+
+    A = np.zeros((n_samples, n_samples))
+    R = np.zeros((n_samples, n_samples))  # Initialize messages
+    # Intermediate results
+    tmp = np.zeros((n_samples, n_samples))
+
+    # Remove degeneracies
+    S += (
+        np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
+    ) * random_state.standard_normal(size=(n_samples, n_samples))
+
+    # Execute parallel affinity propagation updates
+    # e keeps the exemplar decisions of the last `convergence_iter` iterations
+    # (one column per iteration, overwritten cyclically).
+    e = np.zeros((n_samples, convergence_iter))
+
+    ind = np.arange(n_samples)
+
+    for it in range(max_iter):
+        # tmp = A + S; compute responsibilities
+        np.add(A, S, tmp)
+        I = np.argmax(tmp, axis=1)
+        Y = tmp[ind, I]  # np.max(A + S, axis=1)
+        tmp[ind, I] = -np.inf
+        Y2 = np.max(tmp, axis=1)
+
+        # tmp = Rnew
+        np.subtract(S, Y[:, None], tmp)
+        tmp[ind, I] = S[ind, I] - Y2
+
+        # Damping
+        tmp *= 1 - damping
+        R *= damping
+        R += tmp
+
+        # tmp = Rp; compute availabilities
+        np.maximum(R, 0, tmp)
+        tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]
+
+        # tmp = -Anew
+        tmp -= np.sum(tmp, axis=0)
+        dA = np.diag(tmp).copy()
+        tmp.clip(0, np.inf, tmp)
+        tmp.flat[:: n_samples + 1] = dA
+
+        # Damping
+        tmp *= 1 - damping
+        A *= damping
+        A -= tmp
+
+        # Check for convergence
+        E = (np.diag(A) + np.diag(R)) > 0
+        e[:, it % convergence_iter] = E
+        K = np.sum(E, axis=0)
+
+        if it >= convergence_iter:
+            se = np.sum(e, axis=1)
+            # A sample is stable when it was an exemplar in all of the stored
+            # iterations or in none of them.
+            unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
+            # `it` never reaches `max_iter` inside `range(max_iter)`, so
+            # exhausting the iterations is handled by the `else` clause below.
+            if not unconverged and (K > 0):
+                never_converged = False
+                if verbose:
+                    print("Converged after %d iterations." % it)
+                break
+    else:
+        never_converged = True
+        if verbose:
+            print("Did not converge")
+
+    I = np.flatnonzero(E)
+    K = I.size  # Identify exemplars
+
+    if K > 0:
+        if never_converged:
+            warnings.warn(
+                (
+                    "Affinity propagation did not converge, this model "
+                    "may return degenerate cluster centers and labels."
+                ),
+                ConvergenceWarning,
+            )
+        c = np.argmax(S[:, I], axis=1)
+        c[I] = np.arange(K)  # Identify clusters
+        # Refine the final set of exemplars and clusters and return results
+        for k in range(K):
+            ii = np.where(c == k)[0]
+            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
+            I[k] = ii[j]
+
+        c = np.argmax(S[:, I], axis=1)
+        c[I] = np.arange(K)
+        labels = I[c]
+        # Reduce labels to a sorted, gapless, list
+        cluster_centers_indices = np.unique(labels)
+        labels = np.searchsorted(cluster_centers_indices, labels)
+    else:
+        warnings.warn(
+            (
+                "Affinity propagation did not converge and this model "
+                "will not have any cluster centers."
+            ),
+            ConvergenceWarning,
+        )
+        labels = np.array([-1] * n_samples)
+        # Empty integer array (not a list) so that every code path returns an
+        # ndarray that can be used directly as an index.
+        cluster_centers_indices = np.array([], dtype=np.intp)
+
+    if return_n_iter:
+        return cluster_centers_indices, labels, it + 1
+    else:
+        return cluster_centers_indices, labels
+
+
+###############################################################################
+# Public API
+
+
+@validate_params(
+    {
+        "S": ["array-like"],
+        "return_n_iter": ["boolean"],
+    },
+    prefer_skip_nested_validation=False,
+)
+def affinity_propagation(
+    S,
+    *,
+    preference=None,
+    convergence_iter=15,
+    max_iter=200,
+    damping=0.5,
+    copy=True,
+    verbose=False,
+    return_n_iter=False,
+    random_state=None,
+):
+    """Perform Affinity Propagation Clustering of data.
+
+    Read more in the :ref:`User Guide <affinity_propagation>`.
+
+    Parameters
+    ----------
+    S : array-like of shape (n_samples, n_samples)
+        Matrix of similarities between points.
+
+    preference : array-like of shape (n_samples,) or float, default=None
+        Preferences for each point - points with larger values of
+        preferences are more likely to be chosen as exemplars. The number of
+        exemplars, i.e. of clusters, is influenced by the input preferences
+        value. If the preferences are not passed as arguments, they will be
+        set to the median of the input similarities (resulting in a moderate
+        number of clusters). For a smaller amount of clusters, this can be set
+        to the minimum value of the similarities.
+
+    convergence_iter : int, default=15
+        Number of iterations with no change in the number
+        of estimated clusters that stops the convergence.
+
+    max_iter : int, default=200
+        Maximum number of iterations.
+
+    damping : float, default=0.5
+        Damping factor between 0.5 and 1.
+
+    copy : bool, default=True
+        If copy is False, the affinity matrix is modified inplace by the
+        algorithm, for memory efficiency.
+
+    verbose : bool, default=False
+        The verbosity level.
+
+    return_n_iter : bool, default=False
+        Whether or not to return the number of iterations.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo-random number generator to control the starting state.
+        Use an int for reproducible results across function calls.
+        See the :term:`Glossary <random_state>`.
+
+        .. versionadded:: 0.23
+            this parameter was previously hardcoded as 0.
+
+    Returns
+    -------
+    cluster_centers_indices : ndarray of shape (n_clusters,)
+        Index of clusters centers.
+
+    labels : ndarray of shape (n_samples,)
+        Cluster labels for each point.
+
+    n_iter : int
+        Number of iterations run. Returned only if `return_n_iter` is
+        set to True.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
+    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
+
+    When the algorithm does not converge, it will still return arrays of
+    ``cluster_centers_indices`` and labels if there are any exemplars/clusters,
+    however they may be degenerate and should be used with caution.
+
+    When all training samples have equal similarities and equal preferences,
+    the assignment of cluster centers and labels depends on the preference.
+    If the preference is smaller than the similarities, a single cluster center
+    and label ``0`` for every sample will be returned. Otherwise, every
+    training sample becomes its own cluster center and is assigned a unique
+    label.
+
+    References
+    ----------
+    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
+    Between Data Points", Science Feb. 2007
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.cluster import affinity_propagation
+    >>> from sklearn.metrics.pairwise import euclidean_distances
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> S = -euclidean_distances(X, squared=True)
+    >>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
+    >>> cluster_centers_indices
+    array([0, 3])
+    >>> labels
+    array([0, 0, 0, 1, 1, 1])
+    """
+    # The functional API is a thin wrapper: `S` is handed to the estimator as
+    # a precomputed affinity matrix.
+    estimator = AffinityPropagation(
+        affinity="precomputed",
+        preference=preference,
+        damping=damping,
+        max_iter=max_iter,
+        convergence_iter=convergence_iter,
+        copy=copy,
+        verbose=verbose,
+        random_state=random_state,
+    )
+    estimator.fit(S)
+
+    centers_and_labels = (estimator.cluster_centers_indices_, estimator.labels_)
+    if not return_n_iter:
+        return centers_and_labels
+    return (*centers_and_labels, estimator.n_iter_)
+
+
+class AffinityPropagation(ClusterMixin, BaseEstimator):
+    """Perform Affinity Propagation Clustering of data.
+
+    Read more in the :ref:`User Guide <affinity_propagation>`.
+
+    Parameters
+    ----------
+    damping : float, default=0.5
+        Damping factor in the range `[0.5, 1.0)` is the extent to
+        which the current value is maintained relative to
+        incoming values (weighted 1 - damping). This in order
+        to avoid numerical oscillations when updating these
+        values (messages).
+
+    max_iter : int, default=200
+        Maximum number of iterations.
+
+    convergence_iter : int, default=15
+        Number of iterations with no change in the number
+        of estimated clusters that stops the convergence.
+
+    copy : bool, default=True
+        Make a copy of input data.
+
+    preference : array-like of shape (n_samples,) or float, default=None
+        Preferences for each point - points with larger values of
+        preferences are more likely to be chosen as exemplars. The number
+        of exemplars, i.e. of clusters, is influenced by the input
+        preferences value. If the preferences are not passed as arguments,
+        they will be set to the median of the input similarities.
+
+    affinity : {'euclidean', 'precomputed'}, default='euclidean'
+        Which affinity to use. At the moment 'precomputed' and
+        ``euclidean`` are supported. 'euclidean' uses the
+        negative squared euclidean distance between points.
+
+    verbose : bool, default=False
+        Whether to be verbose.
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo-random number generator to control the starting state.
+        Use an int for reproducible results across function calls.
+        See the :term:`Glossary <random_state>`.
+
+        .. versionadded:: 0.23
+            this parameter was previously hardcoded as 0.
+
+    Attributes
+    ----------
+    cluster_centers_indices_ : ndarray of shape (n_clusters,)
+        Indices of cluster centers.
+
+    cluster_centers_ : ndarray of shape (n_clusters, n_features)
+        Cluster centers (if affinity != ``precomputed``).
+
+    labels_ : ndarray of shape (n_samples,)
+        Labels of each point.
+
+    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
+        Stores the affinity matrix used in ``fit``.
+
+    n_iter_ : int
+        Number of iterations taken to converge.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    AgglomerativeClustering : Recursively merges the pair of
+        clusters that minimally increases a given linkage distance.
+    FeatureAgglomeration : Similar to AgglomerativeClustering,
+        but recursively merges features instead of samples.
+    KMeans : K-Means clustering.
+    MiniBatchKMeans : Mini-Batch K-Means clustering.
+    MeanShift : Mean shift clustering using a flat kernel.
+    SpectralClustering : Apply clustering to a projection
+        of the normalized Laplacian.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
+    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
+
+    The algorithmic complexity of affinity propagation is quadratic
+    in the number of points.
+
+    When the algorithm does not converge, it will still return arrays of
+    ``cluster_centers_indices_`` and labels if there are any
+    exemplars/clusters, however they may be degenerate and should be used
+    with caution.
+
+    When ``fit`` does not converge, ``cluster_centers_`` is still populated
+    however it may be degenerate. In such a case, proceed with caution.
+    If ``fit`` does not converge and fails to produce any ``cluster_centers_``
+    then ``predict`` will label every sample as ``-1``.
+
+    When all training samples have equal similarities and equal preferences,
+    the assignment of cluster centers and labels depends on the preference.
+    If the preference is smaller than the similarities, ``fit`` will result in
+    a single cluster center and label ``0`` for every sample. Otherwise, every
+    training sample becomes its own cluster center and is assigned a unique
+    label.
+
+    References
+    ----------
+
+    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
+    Between Data Points", Science Feb. 2007
+
+    Examples
+    --------
+    >>> from sklearn.cluster import AffinityPropagation
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [4, 2], [4, 4], [4, 0]])
+    >>> clustering = AffinityPropagation(random_state=5).fit(X)
+    >>> clustering
+    AffinityPropagation(random_state=5)
+    >>> clustering.labels_
+    array([0, 0, 0, 1, 1, 1])
+    >>> clustering.predict([[0, 0], [4, 4]])
+    array([0, 1])
+    >>> clustering.cluster_centers_
+    array([[1, 2],
+           [4, 2]])
+    """
+
+    _parameter_constraints: dict = {
+        "damping": [Interval(Real, 0.5, 1.0, closed="left")],
+        "max_iter": [Interval(Integral, 1, None, closed="left")],
+        "convergence_iter": [Interval(Integral, 1, None, closed="left")],
+        "copy": ["boolean"],
+        "preference": [
+            "array-like",
+            Interval(Real, None, None, closed="neither"),
+            None,
+        ],
+        "affinity": [StrOptions({"euclidean", "precomputed"})],
+        "verbose": ["verbose"],
+        "random_state": ["random_state"],
+    }
+
+    def __init__(
+        self,
+        *,
+        damping=0.5,
+        max_iter=200,
+        convergence_iter=15,
+        copy=True,
+        preference=None,
+        affinity="euclidean",
+        verbose=False,
+        random_state=None,
+    ):
+        # Parameters are stored untouched; they are only interpreted in `fit`.
+        self.damping = damping
+        self.max_iter = max_iter
+        self.convergence_iter = convergence_iter
+        self.copy = copy
+        self.verbose = verbose
+        self.preference = preference
+        self.affinity = affinity
+        self.random_state = random_state
+
+    def _more_tags(self):
+        # The input is pairwise exactly when the caller supplies the
+        # similarity matrix directly.
+        is_precomputed = self.affinity == "precomputed"
+        return {"pairwise": is_precomputed}
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the clustering from features, or affinity matrix.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
+                array-like of shape (n_samples, n_samples)
+            Training instances to cluster, or similarities / affinities between
+            instances if ``affinity='precomputed'``. If a sparse feature matrix
+            is provided, it will be converted into a sparse ``csr_matrix``.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        self
+            Returns the instance itself.
+        """
+        precomputed = self.affinity == "precomputed"
+        # Sparse input is accepted for feature matrices only.
+        X = self._validate_data(X, accept_sparse=False if precomputed else "csr")
+        if precomputed:
+            self.affinity_matrix_ = X.copy() if self.copy else X
+        else:  # self.affinity == "euclidean"
+            self.affinity_matrix_ = -euclidean_distances(X, squared=True)
+
+        similarities = self.affinity_matrix_
+        if similarities.shape[0] != similarities.shape[1]:
+            raise ValueError(
+                "The matrix of similarities must be a square array. "
+                f"Got {similarities.shape} instead."
+            )
+
+        preference = self.preference
+        if preference is None:
+            # Default preference: the median of the similarities.
+            preference = np.median(similarities)
+        preference = np.asarray(preference)
+
+        random_state = check_random_state(self.random_state)
+
+        centers_indices, labels, n_iter = _affinity_propagation(
+            similarities,
+            max_iter=self.max_iter,
+            convergence_iter=self.convergence_iter,
+            preference=preference,
+            damping=self.damping,
+            verbose=self.verbose,
+            return_n_iter=True,
+            random_state=random_state,
+        )
+        self.cluster_centers_indices_ = centers_indices
+        self.labels_ = labels
+        self.n_iter_ = n_iter
+
+        if not precomputed:
+            # Only feature input has rows that can serve as cluster centers.
+            self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
+
+        return self
+
+    def predict(self, X):
+        """Predict the closest cluster each sample in X belongs to.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            New data to predict. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Cluster labels.
+        """
+        check_is_fitted(self)
+        X = self._validate_data(X, reset=False, accept_sparse="csr")
+        if not hasattr(self, "cluster_centers_"):
+            raise ValueError(
+                "Predict method is not supported when affinity='precomputed'."
+            )
+
+        n_centers = self.cluster_centers_.shape[0]
+        if n_centers == 0:
+            warnings.warn(
+                (
+                    "This model does not have any cluster centers "
+                    "because affinity propagation did not converge. "
+                    "Labeling every sample as '-1'."
+                ),
+                ConvergenceWarning,
+            )
+            return np.array([-1] * X.shape[0])
+
+        # X was validated above, so the finiteness check is skipped here.
+        with config_context(assume_finite=True):
+            return pairwise_distances_argmin(X, self.cluster_centers_)
+
+    def fit_predict(self, X, y=None):
+        """Fit clustering from features/affinity matrix; return cluster labels.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
+                array-like of shape (n_samples, n_samples)
+            Training instances to cluster, or similarities / affinities between
+            instances if ``affinity='precomputed'``. If a sparse feature matrix
+            is provided, it will be converted into a sparse ``csr_matrix``.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Cluster labels.
+        """
+        # Delegates to the mixin implementation.
+        return super().fit_predict(X, y)
@@ -0,0 +1,624 @@
+"""Spectral biclustering algorithms."""
+
+# Authors : Kemal Eren
+# License: BSD 3 clause
+
+from abc import ABCMeta, abstractmethod
+from numbers import Integral
+
+import numpy as np
+from scipy.linalg import norm
+from scipy.sparse import dia_matrix, issparse
+from scipy.sparse.linalg import eigsh, svds
+
+from ..base import BaseEstimator, BiclusterMixin, _fit_context
+from ..utils import check_random_state, check_scalar
+from ..utils._param_validation import Interval, StrOptions
+from ..utils.extmath import make_nonnegative, randomized_svd, safe_sparse_dot
+from ..utils.validation import assert_all_finite
+from ._kmeans import KMeans, MiniBatchKMeans
+
+__all__ = ["SpectralCoclustering", "SpectralBiclustering"]
+
+
+def _scale_normalize(X):
+    """Normalize ``X`` by scaling rows and columns independently.
+
+    Each row is scaled by the inverse square root of its sum and each column
+    by the inverse square root of its sum. Scaling factors that come out as
+    NaN are replaced by 0.
+
+    Parameters
+    ----------
+    X : ndarray or sparse matrix of shape (n_rows, n_cols)
+        Input data; it is made nonnegative before scaling.
+
+    Returns
+    -------
+    an : ndarray or sparse matrix of shape (n_rows, n_cols)
+        The normalized matrix.
+
+    row_diag : ndarray of shape (n_rows,)
+        Row scaling factors.
+
+    col_diag : ndarray of shape (n_cols,)
+        Column scaling factors.
+    """
+    X = make_nonnegative(X)
+    # `ravel` (rather than `squeeze`) keeps the factors 1-D even when X has a
+    # single row or a single column, so the indexing below stays valid.
+    row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).ravel()
+    col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).ravel()
+    row_diag = np.where(np.isnan(row_diag), 0, row_diag)
+    col_diag = np.where(np.isnan(col_diag), 0, col_diag)
+    if issparse(X):
+        # Apply the scaling as products with diagonal matrices.
+        n_rows, n_cols = X.shape
+        r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
+        c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
+        an = r * X * c
+    else:
+        an = row_diag[:, np.newaxis] * X * col_diag
+    return an, row_diag, col_diag
+
+
+def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
+    """Normalize rows and columns of ``X`` simultaneously so that all
+    rows sum to one constant and all columns sum to a different
+    constant.
+
+    The row/column scaling is applied repeatedly until one pass changes the
+    matrix by less than `tol`, or `max_iter` passes have been made.
+
+    Parameters
+    ----------
+    X : ndarray or sparse matrix
+        Input data; it is made nonnegative before scaling.
+
+    max_iter : int, default=1000
+        Maximum number of scaling passes.
+
+    tol : float, default=1e-5
+        A pass whose change has a norm below this value stops the iteration.
+
+    Returns
+    -------
+    X_scaled : ndarray or sparse matrix
+        The result of the last scaling pass.
+    """
+    # According to paper, this can also be done more efficiently with
+    # deviation reduction and balancing algorithms.
+    X = make_nonnegative(X)
+    X_scaled = X
+    for _ in range(max_iter):
+        X_new, _, _ = _scale_normalize(X_scaled)
+        # Measure the change made by this pass. The sparse case must compare
+        # against X_new as well: comparing against the input X gives a
+        # distance of exactly 0 on the first pass (X_scaled is X) and would
+        # stop the iteration immediately.
+        delta = X_scaled - X_new
+        if issparse(X):
+            dist = norm(delta.data)
+        else:
+            dist = norm(delta)
+        X_scaled = X_new
+        if dist < tol:
+            break
+    return X_scaled
+
+
+def _log_normalize(X):
+    """Normalize ``X`` according to Kluger's log-interactions scheme.
+
+    The element-wise logarithm of the data has its row means and column means
+    subtracted and its overall mean added back. Sparse input is rejected.
+    """
+    X = make_nonnegative(X, min_value=1)
+    if issparse(X):
+        message = (
+            "Cannot compute log of a sparse matrix,"
+            " because log(x) diverges to -infinity as x"
+            " goes to 0."
+        )
+        raise ValueError(message)
+
+    log_X = np.log(X)
+    # Row means as a column vector so they broadcast across columns.
+    row_means = log_X.mean(axis=1)[:, np.newaxis]
+    col_means = log_X.mean(axis=0)
+    overall_mean = log_X.mean()
+    return log_X - row_means - col_means + overall_mean
+
+
+class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
+    """Base class for spectral biclustering.
+
+    Subclasses provide ``_check_parameters`` and ``_fit``; this class supplies
+    the shared ``fit`` entry point and the SVD and k-means helpers.
+    """
+
+    _parameter_constraints: dict = {
+        "svd_method": [StrOptions({"randomized", "arpack"})],
+        "n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None],
+        "mini_batch": ["boolean"],
+        "init": [StrOptions({"k-means++", "random"}), np.ndarray],
+        "n_init": [Interval(Integral, 1, None, closed="left")],
+        "random_state": ["random_state"],
+    }
+
+    @abstractmethod
+    def __init__(
+        self,
+        n_clusters=3,
+        svd_method="randomized",
+        n_svd_vecs=None,
+        mini_batch=False,
+        init="k-means++",
+        n_init=10,
+        random_state=None,
+    ):
+        # Parameters are stored untouched; they are interpreted in `fit`.
+        self.n_clusters = n_clusters
+        self.svd_method = svd_method
+        self.n_svd_vecs = n_svd_vecs
+        self.mini_batch = mini_batch
+        self.init = init
+        self.n_init = n_init
+        self.random_state = random_state
+
+    @abstractmethod
+    def _check_parameters(self, n_samples):
+        """Validate parameters depending on the input data."""
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Create a biclustering for X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            SpectralBiclustering instance.
+        """
+        X = self._validate_data(X, accept_sparse="csr", dtype=np.float64)
+        n_samples = X.shape[0]
+        self._check_parameters(n_samples)
+        self._fit(X)
+        return self
+
+    def _svd(self, array, n_components, n_discard):
+        """Returns first `n_components` left and right singular
+        vectors u and v, discarding the first `n_discard`.
+        """
+        if self.svd_method == "randomized":
+            # `n_svd_vecs` is forwarded as `n_oversamples` only when it is set.
+            extra_args = (
+                {} if self.n_svd_vecs is None else {"n_oversamples": self.n_svd_vecs}
+            )
+            u, _, vt = randomized_svd(
+                array, n_components, random_state=self.random_state, **extra_args
+            )
+
+        elif self.svd_method == "arpack":
+            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
+            if np.isnan(vt).any():
+                # some eigenvalues of A * A.T are negative, causing
+                # sqrt() to be np.nan. This causes some vectors in vt
+                # to be np.nan.
+                gram = safe_sparse_dot(array.T, array)
+                rng = check_random_state(self.random_state)
+                # initialize with [-1,1] as in ARPACK
+                start_vector = rng.uniform(-1, 1, gram.shape[0])
+                _, eigenvectors = eigsh(gram, ncv=self.n_svd_vecs, v0=start_vector)
+                vt = eigenvectors.T
+            if np.isnan(u).any():
+                gram = safe_sparse_dot(array, array.T)
+                rng = check_random_state(self.random_state)
+                # initialize with [-1,1] as in ARPACK
+                start_vector = rng.uniform(-1, 1, gram.shape[0])
+                _, u = eigsh(gram, ncv=self.n_svd_vecs, v0=start_vector)
+
+        assert_all_finite(u)
+        assert_all_finite(vt)
+        # Drop the leading `n_discard` vectors and return both as columns.
+        return u[:, n_discard:], vt[n_discard:].T
+
+    def _k_means(self, data, n_clusters):
+        """Cluster `data` with (mini-batch) k-means.
+
+        Returns the cluster centers and the label of each row of `data`.
+        """
+        kmeans_cls = MiniBatchKMeans if self.mini_batch else KMeans
+        model = kmeans_cls(
+            n_clusters,
+            init=self.init,
+            n_init=self.n_init,
+            random_state=self.random_state,
+        )
+        model.fit(data)
+        return model.cluster_centers_, model.labels_
+
+    def _more_tags(self):
+        # Common estimator checks marked as expected failures, with the reason.
+        xfail_reasons = {
+            "check_estimators_dtypes": "raises nan error",
+            "check_fit2d_1sample": "_scale_normalize fails",
+            "check_fit2d_1feature": "raises apply_along_axis error",
+            "check_estimator_sparse_matrix": "does not fail gracefully",
+            "check_estimator_sparse_array": "does not fail gracefully",
+            "check_methods_subset_invariance": "empty array passed inside",
+            "check_dont_overwrite_parameters": "empty array passed inside",
+            "check_fit2d_predict1d": "empty array passed inside",
+        }
+        return {"_xfail_checks": xfail_reasons}
+
+
+class SpectralCoclustering(BaseSpectral):
+    """Spectral Co-Clustering algorithm (Dhillon, 2001).
+
+    Clusters rows and columns of an array `X` to solve the relaxed
+    normalized cut of the bipartite graph created from `X` as follows:
+    the edge between row vertex `i` and column vertex `j` has weight
+    `X[i, j]`.
+
+    The resulting bicluster structure is block-diagonal, since each
+    row and each column belongs to exactly one bicluster.
+
+    Supports sparse matrices, as long as they are nonnegative.
+
+    Read more in the :ref:`User Guide <spectral_coclustering>`.
+
+    Parameters
+    ----------
+    n_clusters : int, default=3
+        The number of biclusters to find.
+
+    svd_method : {'randomized', 'arpack'}, default='randomized'
+        Selects the algorithm for finding singular vectors. May be
+        'randomized' or 'arpack'. If 'randomized', use
+        :func:`sklearn.utils.extmath.randomized_svd`, which may be faster
+        for large matrices. If 'arpack', use
+        :func:`scipy.sparse.linalg.svds`, which is more accurate, but
+        possibly slower in some cases.
+
+    n_svd_vecs : int, default=None
+        Number of vectors to use in calculating the SVD. Corresponds
+        to `ncv` when `svd_method=arpack` and `n_oversamples` when
+        `svd_method` is 'randomized'.
+
+    mini_batch : bool, default=False
+        Whether to use mini-batch k-means, which is faster but may get
+        different results.
+
+    init : {'k-means++', 'random'}, or ndarray of shape \
+            (n_clusters, n_features), default='k-means++'
+        Method for initialization of k-means algorithm; defaults to
+        'k-means++'.
+
+    n_init : int, default=10
+        Number of random initializations that are tried with the
+        k-means algorithm.
+
+        If mini-batch k-means is used, the best initialization is
+        chosen and the algorithm runs once. Otherwise, the algorithm
+        is run for each initialization and the best solution chosen.
+
+    random_state : int, RandomState instance, default=None
+        Used for randomizing the singular value decomposition and the k-means
+        initialization. Use an int to make the randomness deterministic.
+        See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    rows_ : array-like of shape (n_row_clusters, n_rows)
+        Results of the clustering. `rows[i, r]` is True if
+        cluster `i` contains row `r`. Available only after calling ``fit``.
+
+    columns_ : array-like of shape (n_column_clusters, n_columns)
+        Results of the clustering, like `rows`.
+
+    row_labels_ : array-like of shape (n_rows,)
+        The bicluster label of each row.
+
+    column_labels_ : array-like of shape (n_cols,)
+        The bicluster label of each column.
+
+    biclusters_ : tuple of two ndarrays
+        The tuple contains the `rows_` and `columns_` arrays.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    SpectralBiclustering : Partitions rows and columns under the assumption
+        that the data has an underlying checkerboard structure.
+
+    References
+    ----------
+    * :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using
+      bipartite spectral graph partitioning.
+      <10.1145/502512.502550>`
+
+    Examples
+    --------
+    >>> from sklearn.cluster import SpectralCoclustering
+    >>> import numpy as np
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
+    >>> clustering.row_labels_ #doctest: +SKIP
+    array([0, 1, 1, 0, 0, 0], dtype=int32)
+    >>> clustering.column_labels_ #doctest: +SKIP
+    array([0, 0], dtype=int32)
+    >>> clustering
+    SpectralCoclustering(n_clusters=2, random_state=0)
+    """
+
+    # Start from the constraints declared on BaseSpectral and override the
+    # entry for `n_clusters` with this estimator's own rule.
+    # NOTE(review): presumably `Interval(Integral, 1, None, closed="left")`
+    # means "any integer >= 1" -- confirm against the `Interval` helper.
+    _parameter_constraints: dict = {
+        **BaseSpectral._parameter_constraints,
+        "n_clusters": [Interval(Integral, 1, None, closed="left")],
+    }
+
+    def __init__(
+        self,
+        n_clusters=3,
+        *,
+        svd_method="randomized",
+        n_svd_vecs=None,
+        mini_batch=False,
+        init="k-means++",
+        n_init=10,
+        random_state=None,
+    ):
+        # Nothing is stored here directly: every hyper-parameter is handed,
+        # positionally and in this exact order, to the base-class initializer.
+        super().__init__(
+            n_clusters,
+            svd_method,
+            n_svd_vecs,
+            mini_batch,
+            init,
+            n_init,
+            random_state,
+        )
+
+    def _check_parameters(self, n_samples):
+        """Reject an ``n_clusters`` value that is larger than ``n_samples``.
+
+        Parameters
+        ----------
+        n_samples : int
+            Upper bound that ``self.n_clusters`` is compared against.
+
+        Raises
+        ------
+        ValueError
+            If ``self.n_clusters > n_samples``.
+        """
+        n_clusters = self.n_clusters
+        if not n_clusters > n_samples:
+            return
+
+        message = (
+            f"n_clusters should be <= n_samples={n_samples}."
+            f" Got {n_clusters} instead."
+        )
+        raise ValueError(message)
+
+    def _fit(self, X):
+        """Compute row/column labels and the boolean bicluster indicators.
+
+        The data is normalized with ``_scale_normalize``, decomposed with
+        ``self._svd``, and the rescaled outputs for rows and columns are
+        stacked and clustered together with ``self._k_means``. Sets
+        ``row_labels_``, ``column_labels_``, ``rows_`` and ``columns_``.
+        """
+        n_clusters = self.n_clusters
+        scaled, row_scale, col_scale = _scale_normalize(X)
+
+        n_singular = 1 + int(np.ceil(np.log2(n_clusters)))
+        left, right = self._svd(scaled, n_singular, n_discard=1)
+
+        # Rows come first in the stacked matrix, columns second; the label
+        # vector returned by k-means is split at the same boundary below.
+        row_part = row_scale[:, np.newaxis] * left
+        col_part = col_scale[:, np.newaxis] * right
+        stacked = np.vstack((row_part, col_part))
+
+        _, labels = self._k_means(stacked, n_clusters)
+
+        n_rows = X.shape[0]
+        row_labels = labels[:n_rows]
+        column_labels = labels[n_rows:]
+        self.row_labels_ = row_labels
+        self.column_labels_ = column_labels
+
+        # One boolean membership mask per cluster, for rows and for columns.
+        row_masks = []
+        column_masks = []
+        for cluster in range(n_clusters):
+            row_masks.append(row_labels == cluster)
+            column_masks.append(column_labels == cluster)
+        self.rows_ = np.vstack(row_masks)
+        self.columns_ = np.vstack(column_masks)
+
+
+class SpectralBiclustering(BaseSpectral):
+    """Spectral biclustering (Kluger, 2003).
+
+    Partition the rows and the columns of a data matrix under the
+    assumption that the data has an underlying checkerboard structure.
+    With two row partitions and three column partitions, for instance,
+    every row belongs to three biclusters and every column belongs to
+    two biclusters. This checkerboard structure is given by the outer
+    product of the corresponding row and column label vectors.
+
+    Read more in the :ref:`User Guide <spectral_biclustering>`.
+
+    Parameters
+    ----------
+    n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
+        Number of row clusters and column clusters in the checkerboard
+        structure. A single integer is used for both.
+
+    method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
+        How the data is normalized before singular vectors are computed
+        and converted into biclusters. The authors recommend 'log';
+        log normalization does not work on sparse data, however, which
+        is why the default is 'bistochastic'.
+
+        .. warning::
+           if `method='log'`, the data must not be sparse.
+
+    n_components : int, default=6
+        Number of singular vectors to check.
+
+    n_best : int, default=3
+        Number of best singular vectors onto which the data is projected
+        for clustering. Must not exceed `n_components`.
+
+    svd_method : {'randomized', 'arpack'}, default='randomized'
+        Algorithm used to find the singular vectors. 'randomized' uses
+        :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
+        for large matrices; 'arpack' uses `scipy.sparse.linalg.svds`,
+        which is more accurate, but possibly slower in some cases.
+
+    n_svd_vecs : int, default=None
+        Number of vectors to use in calculating the SVD. Corresponds
+        to `ncv` when `svd_method='arpack'` and to `n_oversamples` when
+        `svd_method='randomized'`.
+
+    mini_batch : bool, default=False
+        Whether to use mini-batch k-means, which is faster but may get
+        different results.
+
+    init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
+            default='k-means++'
+        Initialization method of the k-means algorithm.
+
+    n_init : int, default=10
+        Number of random initializations tried with the k-means
+        algorithm.
+
+        With mini-batch k-means the best initialization is chosen and
+        the algorithm runs once. Otherwise the algorithm is run for each
+        initialization and the best solution is kept.
+
+    random_state : int, RandomState instance, default=None
+        Used for randomizing the singular value decomposition and the k-means
+        initialization. Use an int to make the randomness deterministic.
+        See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    rows_ : array-like of shape (n_row_clusters * n_column_clusters, n_rows)
+        Results of the clustering. `rows_[i, r]` is True if
+        bicluster `i` contains row `r`. Available only after calling ``fit``.
+
+    columns_ : array-like of shape \
+            (n_row_clusters * n_column_clusters, n_columns)
+        Results of the clustering, like `rows_`.
+
+    row_labels_ : array-like of shape (n_rows,)
+        Row partition labels.
+
+    column_labels_ : array-like of shape (n_cols,)
+        Column partition labels.
+
+    biclusters_ : tuple of two ndarrays
+        The tuple contains the `rows_` and `columns_` arrays.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001).
+
+    References
+    ----------
+
+    * :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray
+      data: coclustering genes and conditions.
+      <10.1101/gr.648603>`
+
+    Examples
+    --------
+    >>> from sklearn.cluster import SpectralBiclustering
+    >>> import numpy as np
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
+    >>> clustering.row_labels_
+    array([1, 1, 1, 0, 0, 0], dtype=int32)
+    >>> clustering.column_labels_
+    array([1, 0], dtype=int32)
+    >>> clustering
+    SpectralBiclustering(n_clusters=2, random_state=0)
+    """
+
+    # Constraints inherited from BaseSpectral, extended with the parameters
+    # that only this estimator declares.
+    _parameter_constraints: dict = {
+        **BaseSpectral._parameter_constraints,
+        "n_clusters": [Interval(Integral, 1, None, closed="left"), tuple],
+        "method": [StrOptions({"bistochastic", "scale", "log"})],
+        "n_components": [Interval(Integral, 1, None, closed="left")],
+        "n_best": [Interval(Integral, 1, None, closed="left")],
+    }
+
+    def __init__(
+        self,
+        n_clusters=3,
+        *,
+        method="bistochastic",
+        n_components=6,
+        n_best=3,
+        svd_method="randomized",
+        n_svd_vecs=None,
+        mini_batch=False,
+        init="k-means++",
+        n_init=10,
+        random_state=None,
+    ):
+        # Shared hyper-parameters go, positionally, to the base class.
+        super().__init__(
+            n_clusters,
+            svd_method,
+            n_svd_vecs,
+            mini_batch,
+            init,
+            n_init,
+            random_state,
+        )
+        # The remaining ones are specific to this estimator.
+        self.method = method
+        self.n_components = n_components
+        self.n_best = n_best
+
+    def _check_parameters(self, n_samples):
+        """Validate ``n_clusters`` and the ``n_best``/``n_components`` pair.
+
+        Parameters
+        ----------
+        n_samples : int
+            Upper bound applied to ``n_clusters`` (to both entries when it
+            is a pair).
+
+        Raises
+        ------
+        ValueError
+            If an integer ``n_clusters`` exceeds ``n_samples``, if a
+            non-integer ``n_clusters`` cannot be unpacked into two integers
+            in ``[1, n_samples]``, or if ``n_best > n_components``.
+        """
+        n_clusters = self.n_clusters
+        if not isinstance(n_clusters, Integral):
+            # tuple: both unpacking failures and check_scalar failures are
+            # re-raised as a single ValueError naming the offending value.
+            try:
+                n_row_clusters, n_column_clusters = n_clusters
+                named_values = (
+                    (n_row_clusters, "n_row_clusters"),
+                    (n_column_clusters, "n_column_clusters"),
+                )
+                for value, label in named_values:
+                    check_scalar(
+                        value,
+                        label,
+                        target_type=Integral,
+                        min_val=1,
+                        max_val=n_samples,
+                    )
+            except (ValueError, TypeError) as e:
+                raise ValueError(
+                    "Incorrect parameter n_clusters has value:"
+                    f" {self.n_clusters}. It should either be a single integer"
+                    " or an iterable with two integers:"
+                    " (n_row_clusters, n_column_clusters)"
+                    " And the values are should be in the"
+                    " range: (1, n_samples)"
+                ) from e
+        elif n_clusters > n_samples:
+            raise ValueError(
+                f"n_clusters should be <= n_samples={n_samples}. Got"
+                f" {n_clusters} instead."
+            )
+
+        n_best, n_components = self.n_best, self.n_components
+        if n_best > n_components:
+            raise ValueError(
+                f"n_best={n_best} must be <= n_components={n_components}."
+            )
+
+    def _fit(self, X):
+        """Compute the row/column partitions and the bicluster indicators.
+
+        Sets ``row_labels_``, ``column_labels_``, ``rows_`` and ``columns_``.
+        """
+        method = self.method
+        n_sv = self.n_components
+        n_discard = 1
+        if method == "log":
+            # The 'log' method is the only one that neither requests an
+            # extra vector nor discards one.
+            normalized_data = _log_normalize(X)
+            n_discard = 0
+        elif method == "bistochastic":
+            normalized_data = _bistochastic_normalize(X)
+            n_sv += 1
+        elif method == "scale":
+            normalized_data, _, _ = _scale_normalize(X)
+            n_sv += 1
+
+        u, v = self._svd(normalized_data, n_sv, n_discard)
+        left_t, right_t = u.T, v.T
+
+        # `n_clusters` is either a (rows, columns) pair or a single integer
+        # used for both.
+        try:
+            n_row_clusters, n_col_clusters = self.n_clusters
+        except TypeError:
+            n_row_clusters = n_col_clusters = self.n_clusters
+
+        # The order of the four clustering calls below is kept fixed: they
+        # may all draw from the same random state.
+        best_left = self._fit_best_piecewise(left_t, self.n_best, n_row_clusters)
+        best_right = self._fit_best_piecewise(right_t, self.n_best, n_col_clusters)
+
+        self.row_labels_ = self._project_and_cluster(
+            X, best_right.T, n_row_clusters
+        )
+        self.column_labels_ = self._project_and_cluster(
+            X.T, best_left.T, n_col_clusters
+        )
+
+        # Bicluster k = i * n_col_clusters + j pairs row partition i with
+        # column partition j.
+        row_masks = []
+        for label in range(n_row_clusters):
+            mask = self.row_labels_ == label
+            row_masks.extend([mask] * n_col_clusters)
+        self.rows_ = np.vstack(row_masks)
+
+        column_masks = []
+        for _ in range(n_row_clusters):
+            for label in range(n_col_clusters):
+                column_masks.append(self.column_labels_ == label)
+        self.columns_ = np.vstack(column_masks)
+
+    def _fit_best_piecewise(self, vectors, n_best, n_clusters):
+        """Return the ``n_best`` rows of ``vectors`` closest to piecewise
+        constant.
+
+        Every row is clustered on its own with ``self._k_means``; replacing
+        each entry by the center of its cluster gives the piecewise-constant
+        approximation of that row. Rows are ranked by the norm of the
+        difference to their approximation, smallest first.
+        """
+
+        def approximate(vector):
+            centers, assignment = self._k_means(vector.reshape(-1, 1), n_clusters)
+            return centers[assignment].ravel()
+
+        approximations = np.apply_along_axis(approximate, axis=1, arr=vectors)
+        residuals = vectors - approximations
+        errors = np.apply_along_axis(norm, axis=1, arr=residuals)
+        ranking = np.argsort(errors)
+        return vectors[ranking[:n_best]]
+
+    def _project_and_cluster(self, data, vectors, n_clusters):
+        """Cluster the product of ``data`` and ``vectors``; return the labels."""
+        _, labels = self._k_means(safe_sparse_dot(data, vectors), n_clusters)
+        return labels
@@ -0,0 +1,741 @@
+# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
+#          Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
+#          Joel Nothman <joel.nothman@gmail.com>
+# License: BSD 3 clause
+
+import warnings
+from math import sqrt
+from numbers import Integral, Real
+
+import numpy as np
+from scipy import sparse
+
+from .._config import config_context
+from ..base import (
+    BaseEstimator,
+    ClassNamePrefixFeaturesOutMixin,
+    ClusterMixin,
+    TransformerMixin,
+    _fit_context,
+)
+from ..exceptions import ConvergenceWarning
+from ..metrics import pairwise_distances_argmin
+from ..metrics.pairwise import euclidean_distances
+from ..utils._param_validation import Interval
+from ..utils.extmath import row_norms
+from ..utils.validation import check_is_fitted
+from . import AgglomerativeClustering
+
+
+def _iterate_sparse_X(X):
+    """Yield every row of a sparse matrix as a freshly allocated dense array.
+
+    The ``indices``, ``data`` and ``indptr`` arrays of ``X`` are read
+    directly, which avoids building a (costly) sparse matrix for each row.
+    Each yielded row is a new 1-D array of length ``X.shape[1]`` created by
+    ``np.zeros`` with its default dtype.
+    """
+    n_rows, n_cols = X.shape
+    col_indices, values, row_ptr = X.indices, X.data, X.indptr
+
+    for row_idx in range(n_rows):
+        # Stored entries of row `row_idx` live in [lo, hi) of indices/data.
+        lo = row_ptr[row_idx]
+        hi = row_ptr[row_idx + 1]
+        dense_row = np.zeros(n_cols)
+        dense_row[col_indices[lo:hi]] = values[lo:hi]
+        yield dense_row
+
+
+def _split_node(node, threshold, branching_factor):
+    """Split a full node into two new nodes, each owned by a new subcluster.
+
+    1. Two empty subclusters and two empty nodes are created; each node
+       becomes the child of one subcluster.
+    2. If ``node`` is a leaf, the two new nodes take its place in the
+       doubly linked list of leaves.
+    3. The pair of subclusters of ``node`` that are farthest apart is found.
+    4. Every subcluster of ``node`` is moved to the new node whose seed it
+       is closer to, and the owning new subcluster is updated with it.
+
+    Returns the two new subclusters.
+    """
+    new_subcluster1, new_subcluster2 = _CFSubcluster(), _CFSubcluster()
+
+    # Both replacement nodes share the configuration of the node being split.
+    node_config = dict(
+        threshold=threshold,
+        branching_factor=branching_factor,
+        is_leaf=node.is_leaf,
+        n_features=node.n_features,
+        dtype=node.init_centroids_.dtype,
+    )
+    new_node1 = _CFNode(**node_config)
+    new_node2 = _CFNode(**node_config)
+    new_subcluster1.child_ = new_node1
+    new_subcluster2.child_ = new_node2
+
+    if node.is_leaf:
+        # Splice new_node1 <-> new_node2 into the leaf chain where `node` was.
+        before, after = node.prev_leaf_, node.next_leaf_
+        if before is not None:
+            before.next_leaf_ = new_node1
+        new_node1.prev_leaf_, new_node1.next_leaf_ = before, new_node2
+        new_node2.prev_leaf_, new_node2.next_leaf_ = new_node1, after
+        if after is not None:
+            after.prev_leaf_ = new_node2
+
+    dist = euclidean_distances(
+        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
+    )
+    n_clusters = dist.shape[0]
+
+    # Indices of the two centroids with the largest pairwise distance.
+    seed1, seed2 = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
+    dist_to_seed1, dist_to_seed2 = dist[[seed1, seed2]]
+
+    goes_to_node1 = dist_to_seed1 < dist_to_seed2
+    # make sure node1 is closest to itself even if all distances are equal.
+    # This can only happen when all node.centroids_ are duplicates leading to all
+    # distances between centroids being zero.
+    goes_to_node1[seed1] = True
+
+    for subcluster, to_first in zip(node.subclusters_, goes_to_node1):
+        if to_first:
+            target_node, target_subcluster = new_node1, new_subcluster1
+        else:
+            target_node, target_subcluster = new_node2, new_subcluster2
+        target_node.append_subcluster(subcluster)
+        target_subcluster.update(subcluster)
+    return new_subcluster1, new_subcluster2
+
+
+class _CFNode:
+    """A node of the CF tree, holding at most ``branching_factor`` subclusters
+    (plus one temporary extra slot used right before the node is split).
+
+    Parameters
+    ----------
+    threshold : float
+        Threshold passed to ``merge_subcluster`` when deciding whether an
+        incoming subcluster can be absorbed by an existing one.
+
+    branching_factor : int
+        Maximum number of CF subclusters in each node.
+
+    is_leaf : bool
+        Whether the node is a leaf. Needed in order to retrieve the final
+        subclusters.
+
+    n_features : int
+        The number of features.
+
+    dtype : dtype
+        Data type of the preallocated centroid and squared-norm buffers.
+
+    Attributes
+    ----------
+    subclusters_ : list
+        List of subclusters for a particular CFNode.
+
+    prev_leaf_ : _CFNode
+        Previous leaf. Useful only if is_leaf is True.
+
+    next_leaf_ : _CFNode
+        Next leaf. Useful only if is_leaf is True.
+
+    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
+        Preallocated buffer. Write to ``init_centroids_`` rather than to
+        ``centroids_``, which is only a view of it.
+
+    init_sq_norm_ : ndarray of shape (branching_factor + 1,)
+        Preallocated buffer, used like ``init_centroids_``.
+
+    centroids_ : ndarray of shape (n_subclusters, n_features)
+        View of the used rows of ``init_centroids_``. Defined once the first
+        subcluster has been appended.
+
+    squared_norm_ : ndarray of shape (n_subclusters,)
+        View of the used entries of ``init_sq_norm_`` (an empty list until
+        the first subcluster has been appended).
+
+    """
+
+    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
+        self.threshold = threshold
+        self.branching_factor = branching_factor
+        self.is_leaf = is_leaf
+        self.n_features = n_features
+
+        # One slot more than `branching_factor`: a node briefly holds an
+        # extra subcluster right before it is split.
+        capacity = branching_factor + 1
+        self.subclusters_ = []
+        self.init_centroids_ = np.zeros((capacity, n_features), dtype=dtype)
+        self.init_sq_norm_ = np.zeros(capacity, dtype)
+        self.squared_norm_ = []
+        self.prev_leaf_ = None
+        self.next_leaf_ = None
+
+    def append_subcluster(self, subcluster):
+        """Store ``subcluster`` in the next free slot and refresh the views."""
+        slot = len(self.subclusters_)
+        self.subclusters_.append(subcluster)
+        self.init_centroids_[slot] = subcluster.centroid_
+        self.init_sq_norm_[slot] = subcluster.sq_norm_
+
+        # `centroids_` / `squared_norm_` are views of the used part of the
+        # buffers, so later writes to the buffers show through them.
+        used = slot + 1
+        self.centroids_ = self.init_centroids_[:used, :]
+        self.squared_norm_ = self.init_sq_norm_[:used]
+
+    def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
+        """Replace ``subcluster`` by the two subclusters it was split into.
+
+        ``new_subcluster1`` takes over the slot of ``subcluster``;
+        ``new_subcluster2`` is appended at the end.
+        """
+        slot = self.subclusters_.index(subcluster)
+        self.subclusters_[slot] = new_subcluster1
+        self.init_centroids_[slot] = new_subcluster1.centroid_
+        self.init_sq_norm_[slot] = new_subcluster1.sq_norm_
+        self.append_subcluster(new_subcluster2)
+
+    def insert_cf_subcluster(self, subcluster):
+        """Insert a new subcluster into the node.
+
+        Returns
+        -------
+        bool
+            True if the node now holds more than ``branching_factor``
+            subclusters and therefore has to be split by the caller,
+            False otherwise.
+        """
+        if not self.subclusters_:
+            self.append_subcluster(subcluster)
+            return False
+
+        threshold = self.threshold
+        branching_factor = self.branching_factor
+
+        # Rank the existing subclusters by ||c||^2 - 2 <c, x>, where x is the
+        # incoming centroid; the smallest value marks the closest one.
+        scores = np.dot(self.centroids_, subcluster.centroid_)
+        scores *= -2.0
+        scores += self.squared_norm_
+        closest_index = np.argmin(scores)
+        closest_subcluster = self.subclusters_[closest_index]
+        child = closest_subcluster.child_
+
+        if child is None:
+            # No child below the closest subcluster: try to absorb the
+            # newcomer into it.
+            if closest_subcluster.merge_subcluster(subcluster, threshold):
+                self.init_centroids_[closest_index] = closest_subcluster.centroid_
+                self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
+                return False
+
+            # Not close enough to merge: store it as a new subcluster. If
+            # there was no room left, the node is now over capacity and
+            # needs to be split.
+            had_room = len(self.subclusters_) < branching_factor
+            self.append_subcluster(subcluster)
+            return not had_room
+
+        # The closest subcluster has a child: descend recursively.
+        if not child.insert_cf_subcluster(subcluster):
+            # The child did not overflow, so updating the statistics of the
+            # closest subcluster is all that is left to do.
+            closest_subcluster.update(subcluster)
+            self.init_centroids_[closest_index] = closest_subcluster.centroid_
+            self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
+            return False
+
+        # The child overflowed: redistribute its subclusters over two new
+        # nodes and make room for the additional subcluster in this node.
+        new_subcluster1, new_subcluster2 = _split_node(
+            child, threshold, branching_factor
+        )
+        self.update_split_subclusters(
+            closest_subcluster, new_subcluster1, new_subcluster2
+        )
+        return len(self.subclusters_) > branching_factor
+
+
+class _CFSubcluster:
+    """Each subcluster in a CFNode is called a CFSubcluster.
+
+    A CFSubcluster can have a CFNode as its child.
+
+    Parameters
+    ----------
+    linear_sum : ndarray of shape (n_features,), default=None
+        Sample. This is kept optional to allow initialization of empty
+        subclusters.
+
+    Attributes
+    ----------
+    n_samples_ : int
+        Number of samples that belong to each subcluster.
+
+    linear_sum_ : ndarray
+        Linear sum of all the samples in a subcluster. Prevents holding
+        all sample data in memory. It is the integer ``0`` for an empty
+        subcluster.
+
+    squared_sum_ : float
+        Sum of the squared l2 norms of all samples belonging to a subcluster.
+
+    centroid_ : ndarray of shape (n_features,)
+        Centroid of the subcluster. Prevent recomputing of centroids when
+        ``CFNode.centroids_`` is called. It is the integer ``0`` for an
+        empty subcluster.
+
+    child_ : _CFNode
+        Child Node of the subcluster. Once a given _CFNode is set as the child
+        of the _CFSubcluster, it is set to ``self.child_``.
+
+    sq_norm_ : float
+        Squared norm of the centroid. Used to prevent recomputing when
+        pairwise minimum distances are computed. It is ``0.0`` for an empty
+        subcluster.
+    """
+
+    def __init__(self, *, linear_sum=None):
+        if linear_sum is None:
+            self.n_samples_ = 0
+            self.squared_sum_ = 0.0
+            self.centroid_ = self.linear_sum_ = 0
+            # Define `sq_norm_` for empty subclusters as well, so that every
+            # instance exposes the same set of attributes.
+            self.sq_norm_ = 0.0
+        else:
+            self.n_samples_ = 1
+            # NOTE: `centroid_` and `linear_sum_` alias the array passed in;
+            # no copy is made here.
+            self.centroid_ = self.linear_sum_ = linear_sum
+            self.squared_sum_ = self.sq_norm_ = np.dot(
+                self.linear_sum_, self.linear_sum_
+            )
+        self.child_ = None
+
+    def update(self, subcluster):
+        """Add the statistics of ``subcluster`` to this one unconditionally
+        and recompute the centroid and its squared norm.
+        """
+        self.n_samples_ += subcluster.n_samples_
+        # In-place add; on an empty subcluster (`linear_sum_ == 0`) this
+        # produces a new array instead.
+        self.linear_sum_ += subcluster.linear_sum_
+        self.squared_sum_ += subcluster.squared_sum_
+        self.centroid_ = self.linear_sum_ / self.n_samples_
+        self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
+
+    def merge_subcluster(self, nominee_cluster, threshold):
+        """Merge ``nominee_cluster`` into this subcluster if the radius of
+        the merged subcluster does not exceed ``threshold``.
+
+        Returns
+        -------
+        bool
+            True if the merge was performed (this subcluster was updated),
+            False if it was rejected (this subcluster is left untouched).
+        """
+        new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
+        new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
+        new_n = self.n_samples_ + nominee_cluster.n_samples_
+        new_centroid = (1 / new_n) * new_ls
+        new_sq_norm = np.dot(new_centroid, new_centroid)
+
+        # The squared radius of the cluster is defined:
+        #   r^2  = sum_i ||x_i - c||^2 / n
+        # with x_i the n points assigned to the cluster and c its centroid:
+        #   c = sum_i x_i / n
+        # This can be expanded to:
+        #   r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
+        # and therefore simplifies to:
+        #   r^2 = sum_i ||x_i||^2 / n - ||c||^2
+        sq_radius = new_ss / new_n - new_sq_norm
+
+        if sq_radius <= threshold**2:
+            (
+                self.n_samples_,
+                self.linear_sum_,
+                self.squared_sum_,
+                self.centroid_,
+                self.sq_norm_,
+            ) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
+            return True
+        return False
+
+    @property
+    def radius(self):
+        """Return radius of the subcluster (``0.0`` if it holds no sample)."""
+        if self.n_samples_ == 0:
+            # An empty subcluster has no extent; this also avoids dividing
+            # by zero below.
+            return 0.0
+        # Because of numerical issues, this could become negative
+        sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
+        return sqrt(max(0, sq_radius))
+
+
+class Birch(
+    ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
+):
+    """Implements the BIRCH clustering algorithm.
+
+    It is a memory-efficient, online-learning algorithm provided as an
+    alternative to :class:`MiniBatchKMeans`. It constructs a tree
+    data structure with the cluster centroids being read off the leaf.
+    These can be either the final cluster centroids or can be provided as input
+    to another clustering algorithm such as :class:`AgglomerativeClustering`.
+
+    Read more in the :ref:`User Guide <birch>`.
+
+    .. versionadded:: 0.16
+
+    Parameters
+    ----------
+    threshold : float, default=0.5
+        The radius of the subcluster obtained by merging a new sample and the
+        closest subcluster should not exceed the threshold. Otherwise a new
+        subcluster is started. Setting this value to be very low promotes
+        splitting and vice-versa.
+
+    branching_factor : int, default=50
+        Maximum number of CF subclusters in each node. If a new sample enters
+        such that the number of subclusters exceeds the branching_factor then
+        that node is split into two nodes with the subclusters redistributed
+        in each. The parent subcluster of that node is removed and two new
+        subclusters are added as parents of the 2 split nodes.
+
+    n_clusters : int, instance of sklearn.cluster model or None, default=3
+        Number of clusters after the final clustering step, which treats the
+        subclusters from the leaves as new samples.
+
+        - `None` : the final clustering step is not performed and the
+          subclusters are returned as they are.
+
+        - :mod:`sklearn.cluster` Estimator : If a model is provided, the model
+          is fit treating the subclusters as new samples and the initial data
+          is mapped to the label of the closest subcluster.
+
+        - `int` : the model fit is :class:`AgglomerativeClustering` with
+          `n_clusters` set to be equal to the int.
+
+    compute_labels : bool, default=True
+        Whether or not to compute labels for each fit.
+
+    copy : bool, default=True
+        Whether or not to make a copy of the given data. If set to False,
+        the initial data will be overwritten.
+
+    Attributes
+    ----------
+    root_ : _CFNode
+        Root of the CFTree.
+
+    dummy_leaf_ : _CFNode
+        Start pointer to all the leaves.
+
+    subcluster_centers_ : ndarray
+        Centroids of all subclusters read directly from the leaves.
+
+    subcluster_labels_ : ndarray
+        Labels assigned to the centroids of the subclusters after
+        they are clustered globally.
+
+    labels_ : ndarray of shape (n_samples,)
+        Array of labels assigned to the input data.
+        if partial_fit is used instead of fit, they are assigned to the
+        last batch of data.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    MiniBatchKMeans : Alternative implementation that does incremental updates
+        of the centers' positions using mini-batches.
+
+    Notes
+    -----
+    The tree data structure consists of nodes with each node consisting of
+    a number of subclusters. The maximum number of subclusters in a node
+    is determined by the branching factor. Each subcluster maintains a
+    linear sum, squared sum and the number of samples in that subcluster.
+    In addition, each subcluster can also have a node as its child, if the
+    subcluster is not a member of a leaf node.
+
+    For a new point entering the root, it is merged with the subcluster closest
+    to it and the linear sum, squared sum and the number of samples of that
+    subcluster are updated. This is done recursively till the properties of
+    the leaf node are updated.
+
+    References
+    ----------
+    * Tian Zhang, Raghu Ramakrishnan, Miron Livny
+      BIRCH: An efficient data clustering method for large databases.
+      https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
+
+    * Roberto Perdisci
+      JBirch - Java implementation of BIRCH clustering algorithm
+      https://code.google.com/archive/p/jbirch
+
+    Examples
+    --------
+    >>> from sklearn.cluster import Birch
+    >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
+    >>> brc = Birch(n_clusters=None)
+    >>> brc.fit(X)
+    Birch(n_clusters=None)
+    >>> brc.predict(X)
+    array([0, 0, 0, 1, 1, 1])
+    """
+
+    # Declarative constraints on the constructor arguments, one entry per
+    # parameter; each value lists the alternatives accepted for it.
+    # NOTE(review): presumably consumed by the validation triggered through
+    # `_fit_context` -- confirm against the base class. Also presumably
+    # `closed="neither"` excludes the lower bound (threshold > 0,
+    # branching_factor > 1) while `closed="left"` includes it
+    # (n_clusters >= 1) -- confirm against the `Interval` helper.
+    _parameter_constraints: dict = {
+        "threshold": [Interval(Real, 0.0, None, closed="neither")],
+        "branching_factor": [Interval(Integral, 1, None, closed="neither")],
+        "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
+        "compute_labels": ["boolean"],
+        "copy": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        *,
+        threshold=0.5,
+        branching_factor=50,
+        n_clusters=3,
+        compute_labels=True,
+        copy=True,
+    ):
+        # Hyper-parameters are stored verbatim; nothing is validated or
+        # computed at construction time.
+        self.threshold, self.branching_factor = threshold, branching_factor
+        self.n_clusters = n_clusters
+        self.compute_labels, self.copy = compute_labels, copy
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """
+        Build a CF Tree for the input data.
+
+        A new tree is always started, even if the estimator has been
+        fitted before.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        self
+            Fitted estimator.
+        """
+        # `partial=False` makes `_fit` start from a fresh tree.
+        fitted = self._fit(X, partial=False)
+        return fitted
+
+    def _fit(self, X, partial):
+        """Insert the rows of ``X`` into the CF tree, then cluster globally.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data.
+
+        partial : bool
+            If True and a tree already exists (``root_`` is set), the rows
+            are added to that tree; otherwise a new tree is started.
+
+        Returns
+        -------
+        self
+            Fitted estimator.
+        """
+        existing_root = getattr(self, "root_", None)
+        first_call = not partial or not existing_root
+
+        X = self._validate_data(
+            X,
+            accept_sparse="csr",
+            copy=self.copy,
+            reset=first_call,
+            dtype=[np.float64, np.float32],
+        )
+        threshold = self.threshold
+        branching_factor = self.branching_factor
+        _, n_features = X.shape
+
+        def make_node(is_leaf):
+            # All nodes created during this call share the same settings.
+            return _CFNode(
+                threshold=threshold,
+                branching_factor=branching_factor,
+                is_leaf=is_leaf,
+                n_features=n_features,
+                dtype=X.dtype,
+            )
+
+        # If partial_fit is called for the first time or fit is called, we
+        # start a new tree.
+        if first_call:
+            # The first root is the leaf. Manipulate this object throughout.
+            root = make_node(is_leaf=True)
+            # To enable getting back subclusters: the dummy leaf is the
+            # start pointer of the linked list of leaves.
+            dummy_leaf = make_node(is_leaf=True)
+            dummy_leaf.next_leaf_ = root
+            root.prev_leaf_ = dummy_leaf
+            self.root_ = root
+            self.dummy_leaf_ = dummy_leaf
+
+        # Cannot vectorize. Enough to convince to use cython.
+        samples = _iterate_sparse_X(X) if sparse.issparse(X) else iter(X)
+
+        for sample in samples:
+            overflowed = self.root_.insert_cf_subcluster(
+                _CFSubcluster(linear_sum=sample)
+            )
+            if not overflowed:
+                continue
+
+            # The root is over capacity: split it and grow the tree by one
+            # level, with a new non-leaf root owning the two halves.
+            left, right = _split_node(self.root_, threshold, branching_factor)
+            del self.root_
+            self.root_ = make_node(is_leaf=False)
+            self.root_.append_subcluster(left)
+            self.root_.append_subcluster(right)
+
+        self.subcluster_centers_ = np.concatenate(
+            [leaf.centroids_ for leaf in self._get_leaves()]
+        )
+        self._n_features_out = self.subcluster_centers_.shape[0]
+
+        self._global_clustering(X)
+        return self
+
+    def _get_leaves(self):
+        """
+        Collect the leaf nodes of the CF Tree.
+
+        The leaves form a singly linked list (through `next_leaf_`) that
+        starts right after `dummy_leaf_` and ends at the first ``None``.
+
+        Returns
+        -------
+        leaves : list of shape (n_leaves,)
+            The leaf nodes, in linked-list order.
+        """
+        collected = []
+        cursor = self.dummy_leaf_
+        while True:
+            cursor = cursor.next_leaf_
+            if cursor is None:
+                return collected
+            collected.append(cursor)
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def partial_fit(self, X=None, y=None):
+        """
+        Online learning. Prevents rebuilding of CFTree from scratch.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
+            default=None
+            Input data. If X is not provided, only the global clustering
+            step is done.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        self
+            Fitted estimator.
+
+        Raises
+        ------
+        NotFittedError
+            If `X` is None and no subclusters have been computed yet, i.e.
+            neither `fit` nor `partial_fit` has been called with data before.
+        """
+        if X is not None:
+            return self._fit(X, partial=True)
+
+        # Only the final global clustering step is requested. It reads
+        # `subcluster_centers_`, so fail with an explicit "not fitted" error
+        # rather than a bare AttributeError from inside `_global_clustering`.
+        # NotFittedError derives from AttributeError, so callers that caught
+        # the old error keep working.
+        check_is_fitted(self, "subcluster_centers_")
+        self._global_clustering()
+        return self
+
+    def _check_fit(self, X):
+        """Check that the estimator is fitted and `X` has a matching width.
+
+        Raises ValueError when `subcluster_centers_` exists and its number of
+        columns differs from the number of columns of `X`.
+        """
+        check_is_fitted(self)
+
+        # Without stored subcluster centers there is nothing to compare with.
+        if not hasattr(self, "subcluster_centers_"):
+            return
+
+        if X.shape[1] == self.subcluster_centers_.shape[1]:
+            return
+
+        raise ValueError(
+            "Training data and predicted data do not have same number of features."
+        )
+
+    def predict(self, X):
+        """
+        Label each sample using the ``centroids_`` of the subclusters.
+
+        The squared norms of the subcluster centers cached at fit time are
+        reused, see `_predict`.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Labelled data.
+        """
+        check_is_fitted(self)
+        # `reset=False`: validate against what was recorded during fitting.
+        validated = self._validate_data(X, accept_sparse="csr", reset=False)
+        labels = self._predict(validated)
+        return labels
+
+    def _predict(self, X):
+        """Map each row of `X` to the label of its closest subcluster center."""
+        with config_context(assume_finite=True):
+            # Hand over the cached squared norms of the centers so they are
+            # not recomputed by the pairwise routine.
+            nearest = pairwise_distances_argmin(
+                X,
+                self.subcluster_centers_,
+                metric_kwargs={"Y_norm_squared": self._subcluster_norms},
+            )
+        return self.subcluster_labels_[nearest]
+
+    def transform(self, X):
+        """
+        Transform X into subcluster centroids dimension.
+
+        Each output column holds the Euclidean distance from the samples to
+        one subcluster centroid.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data.
+
+        Returns
+        -------
+        X_trans : ndarray of shape (n_samples, n_subclusters)
+            Distances to the rows of `subcluster_centers_`.
+        """
+        check_is_fitted(self)
+        validated = self._validate_data(X, accept_sparse="csr", reset=False)
+        with config_context(assume_finite=True):
+            distances = euclidean_distances(validated, self.subcluster_centers_)
+        return distances
+
+    def _global_clustering(self, X=None):
+        """
+        Cluster the subcluster centers and, optionally, label the samples.
+
+        Sets `_subcluster_norms` and `subcluster_labels_`. When `X` is given
+        and `compute_labels` is true, `labels_` is set as well.
+        """
+        n_clusters = self.n_clusters
+        centroids = self.subcluster_centers_
+        want_labels = self.compute_labels and X is not None
+
+        # Resolve `n_clusters` into a clusterer (or None).
+        if isinstance(n_clusters, Integral):
+            clusterer = AgglomerativeClustering(n_clusters=n_clusters)
+            # With fewer subclusters than requested clusters there is no need
+            # to perform the global clustering step.
+            too_few_centroids = len(centroids) < n_clusters
+        else:
+            clusterer = n_clusters
+            too_few_centroids = False
+
+        # Cached for `_predict`, which would otherwise recompute them.
+        self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)
+
+        if too_few_centroids:
+            # Every subcluster is its own cluster; tell the user why.
+            self.subcluster_labels_ = np.arange(len(centroids))
+            warnings.warn(
+                "Number of subclusters found (%d) by BIRCH is less "
+                "than (%d). Decrease the threshold."
+                % (len(centroids), self.n_clusters),
+                ConvergenceWarning,
+            )
+        elif clusterer is None:
+            # No global step requested: every subcluster is its own cluster.
+            self.subcluster_labels_ = np.arange(len(centroids))
+        else:
+            # Treat the subcluster centers as samples and let the global
+            # clusterer group them into the final clusters.
+            self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)
+
+        if want_labels:
+            self.labels_ = self._predict(X)
+
+    def _more_tags(self):
+        """Return the estimator tags overridden by this class."""
+        preserved = [np.float64, np.float32]
+        return {"preserves_dtype": preserved}
@@ -0,0 +1,530 @@
+"""Bisecting K-means clustering."""
+
+# Author: Michal Krawczyk <mkrwczyk.1@gmail.com>
+
+import warnings
+
+import numpy as np
+import scipy.sparse as sp
+
+from ..base import _fit_context
+from ..utils._openmp_helpers import _openmp_effective_n_threads
+from ..utils._param_validation import Integral, Interval, StrOptions
+from ..utils.extmath import row_norms
+from ..utils.validation import _check_sample_weight, check_is_fitted, check_random_state
+from ._k_means_common import _inertia_dense, _inertia_sparse
+from ._kmeans import (
+    _BaseKMeans,
+    _kmeans_single_elkan,
+    _kmeans_single_lloyd,
+    _labels_inertia_threadpool_limit,
+)
+
+
+class _BisectingTree:
+    """Binary tree whose leaves are the current clusters of BisectingKMeans."""
+
+    def __init__(self, center, indices, score):
+        """Create a leaf node.
+
+        `center` is the center of the cluster, `indices` are the indices of
+        the data points assigned to it and `score` is the value used to pick
+        the next cluster to bisect.
+        """
+        self.center = center
+        self.indices = indices
+        self.score = score
+
+        # No children yet: the node is a leaf until `split` is called.
+        self.left = None
+        self.right = None
+
+    def split(self, labels, centers, scores):
+        """Turn this leaf into an inner node with two child clusters.
+
+        Points labelled 0 go to the left child, points labelled 1 to the
+        right child.
+        """
+        left_indices = self.indices[labels == 0]
+        right_indices = self.indices[labels == 1]
+
+        self.left = _BisectingTree(centers[0], left_indices, scores[0])
+        self.right = _BisectingTree(centers[1], right_indices, scores[1])
+
+        # The children now own the indices; drop ours to save memory.
+        self.indices = None
+
+    def get_cluster_to_bisect(self):
+        """Return the leaf with the highest score.
+
+        The score is either the number of data points assigned to the cluster
+        or the inertia of the cluster (see `bisecting_strategy` for details).
+        On ties the first leaf in iteration order wins.
+        """
+        return max(self.iter_leaves(), key=lambda leaf: leaf.score)
+
+    def iter_leaves(self):
+        """Yield every leaf of the tree, left subtree before right subtree."""
+        pending = [self]
+        while pending:
+            node = pending.pop()
+            if node.left is None:
+                yield node
+            else:
+                # Push right first so that the left child is visited first.
+                pending.append(node.right)
+                pending.append(node.left)
+
+
+class BisectingKMeans(_BaseKMeans):
+    """Bisecting K-Means clustering.
+
+    Read more in the :ref:`User Guide <bisect_k_means>`.
+
+    .. versionadded:: 1.1
+
+    Parameters
+    ----------
+    n_clusters : int, default=8
+        The number of clusters to form as well as the number of
+        centroids to generate.
+
+    init : {'k-means++', 'random'} or callable, default='random'
+        Method for initialization:
+
+        'k-means++' : selects initial cluster centers for k-means
+        clustering in a smart way to speed up convergence. See section
+        Notes in k_init for more details.
+
+        'random': choose `n_clusters` observations (rows) at random from data
+        for the initial centroids.
+
+        If a callable is passed, it should take arguments X, n_clusters and a
+        random state and return an initialization.
+
+    n_init : int, default=1
+        Number of times the inner k-means algorithm will be run with different
+        centroid seeds in each bisection.
+        For each bisection, the best output of the `n_init` consecutive runs
+        in terms of inertia is kept.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for centroid initialization
+        in inner K-Means. Use an int to make the randomness deterministic.
+        See :term:`Glossary <random_state>`.
+
+    max_iter : int, default=300
+        Maximum number of iterations of the inner k-means algorithm at each
+        bisection.
+
+    verbose : int, default=0
+        Verbosity mode.
+
+    tol : float, default=1e-4
+        Relative tolerance with regards to Frobenius norm of the difference
+        in the cluster centers of two consecutive iterations  to declare
+        convergence. Used in inner k-means algorithm at each bisection to pick
+        best possible clusters.
+
+    copy_x : bool, default=True
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True (default), then the original data is
+        not modified. If False, the original data is modified, and put back
+        before the function returns, but small numerical differences may be
+        introduced by subtracting and then adding the data mean. Note that if
+        the original data is not C-contiguous, a copy will be made even if
+        copy_x is False. If the original data is sparse, but not in CSR format,
+        a copy will be made even if copy_x is False.
+
+    algorithm : {"lloyd", "elkan"}, default="lloyd"
+        Inner K-means algorithm used in bisection.
+        The classical EM-style algorithm is `"lloyd"`.
+        The `"elkan"` variation can be more efficient on some datasets with
+        well-defined clusters, by using the triangle inequality. However it's
+        more memory intensive due to the allocation of an extra array of shape
+        `(n_samples, n_clusters)`.
+
+    bisecting_strategy : {"biggest_inertia", "largest_cluster"},\
+            default="biggest_inertia"
+        Defines how bisection should be performed:
+
+         - "biggest_inertia" means that BisectingKMeans will always check
+            all calculated clusters for the cluster with the biggest SSE
+            (Sum of squared errors) and bisect it. This approach concentrates on
+            precision, but may be costly in terms of execution time (especially for
+            larger amount of data points).
+
+         - "largest_cluster" - BisectingKMeans will always split cluster with
+            largest amount of points assigned to it from all clusters
+            previously calculated. That should work faster than picking by SSE
+            ('biggest_inertia') and may produce similar results in most cases.
+
+    Attributes
+    ----------
+    cluster_centers_ : ndarray of shape (n_clusters, n_features)
+        Coordinates of cluster centers. If the algorithm stops before fully
+        converging (see ``tol`` and ``max_iter``), these will not be
+        consistent with ``labels_``.
+
+    labels_ : ndarray of shape (n_samples,)
+        Labels of each point.
+
+    inertia_ : float
+        Sum of squared distances of samples to their closest cluster center,
+        weighted by the sample weights if provided.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+    See Also
+    --------
+    KMeans : Original implementation of K-Means algorithm.
+
+    Notes
+    -----
+    It might be inefficient when `n_clusters` is less than 3, due to
+    unnecessary calculations for that case.
+
+    Examples
+    --------
+    >>> from sklearn.cluster import BisectingKMeans
+    >>> import numpy as np
+    >>> X = np.array([[1, 1], [10, 1], [3, 1],
+    ...               [10, 0], [2, 1], [10, 2],
+    ...               [10, 8], [10, 9], [10, 10]])
+    >>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
+    >>> bisect_means.labels_
+    array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32)
+    >>> bisect_means.predict([[0, 0], [12, 3]])
+    array([0, 2], dtype=int32)
+    >>> bisect_means.cluster_centers_
+    array([[ 2., 1.],
+           [10., 9.],
+           [10., 1.]])
+    """
+
+    _parameter_constraints: dict = {
+        **_BaseKMeans._parameter_constraints,
+        "init": [StrOptions({"k-means++", "random"}), callable],
+        "n_init": [Interval(Integral, 1, None, closed="left")],
+        "copy_x": ["boolean"],
+        "algorithm": [StrOptions({"lloyd", "elkan"})],
+        "bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})],
+    }
+
+    def __init__(
+        self,
+        n_clusters=8,
+        *,
+        init="random",
+        n_init=1,
+        random_state=None,
+        max_iter=300,
+        verbose=0,
+        tol=1e-4,
+        copy_x=True,
+        algorithm="lloyd",
+        bisecting_strategy="biggest_inertia",
+    ):
+        # Parameters shared with the k-means base class are forwarded to it.
+        super().__init__(
+            n_clusters=n_clusters,
+            init=init,
+            max_iter=max_iter,
+            verbose=verbose,
+            random_state=random_state,
+            tol=tol,
+            n_init=n_init,
+        )
+
+        # Parameters stored by this class itself; read in `fit` and `_bisect`.
+        self.copy_x = copy_x
+        self.algorithm = algorithm
+        self.bisecting_strategy = bisecting_strategy
+
+    def _warn_mkl_vcomp(self, n_active_threads):
+        """Warn about the memory leak seen on Windows with MKL.
+
+        The message suggests setting ``OMP_NUM_THREADS`` to
+        `n_active_threads`.
+        """
+        message = (
+            "BisectingKMeans is known to have a memory leak on Windows "
+            "with MKL, when there are less chunks than available "
+            "threads. You can avoid it by setting the environment"
+            f" variable OMP_NUM_THREADS={n_active_threads}."
+        )
+        warnings.warn(message)
+
+    def _inertia_per_cluster(self, X, centers, labels, sample_weight):
+        """Compute the inertia (sum of squared errors) of each cluster.
+
+        Parameters
+        ----------
+        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
+            The input samples.
+
+        centers : ndarray of shape (n_clusters=2, n_features)
+            The cluster centers.
+
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to.
+
+        sample_weight : ndarray of shape (n_samples,)
+            The weights for each observation in X.
+
+        Returns
+        -------
+        inertia_per_cluster : ndarray of shape (n_clusters=2,)
+            Sum of squared errors (inertia) for each cluster.
+        """
+        # Pick the helper matching the storage format of X.
+        if sp.issparse(X):
+            inertia_func = _inertia_sparse
+        else:
+            inertia_func = _inertia_dense
+
+        # One slot per center; `centers` comes from a bisection, hence 2 rows.
+        per_cluster = np.empty(centers.shape[0])
+        for cluster_id in range(per_cluster.shape[0]):
+            per_cluster[cluster_id] = inertia_func(
+                X,
+                sample_weight,
+                centers,
+                labels,
+                self._n_threads,
+                single_label=cluster_id,
+            )
+
+        return per_cluster
+
+    def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
+        """Split a cluster into 2 subclusters.
+
+        Runs the inner k-means `n_init` times on the samples of the cluster,
+        keeps the run with the lowest inertia and attaches the two resulting
+        subclusters to `cluster_to_bisect` (modified in place, nothing is
+        returned).
+
+        Parameters
+        ----------
+        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
+            Training instances to cluster.
+
+        x_squared_norms : ndarray of shape (n_samples,)
+            Squared euclidean norm of each data point.
+
+        sample_weight : ndarray of shape (n_samples,)
+            The weights for each observation in X.
+
+        cluster_to_bisect : _BisectingTree node object
+            The cluster node to split.
+        """
+        # Restrict all per-sample inputs to the samples of this cluster.
+        X = X[cluster_to_bisect.indices]
+        x_squared_norms = x_squared_norms[cluster_to_bisect.indices]
+        sample_weight = sample_weight[cluster_to_bisect.indices]
+
+        best_inertia = None
+
+        # Split samples in X into 2 clusters.
+        # Repeating `n_init` times to obtain best clusters
+        for _ in range(self.n_init):
+            # Every run draws its initial centers from the shared
+            # `_random_state` created in `fit`.
+            centers_init = self._init_centroids(
+                X,
+                x_squared_norms=x_squared_norms,
+                init=self.init,
+                random_state=self._random_state,
+                n_centroids=2,
+                sample_weight=sample_weight,
+            )
+
+            labels, inertia, centers, _ = self._kmeans_single(
+                X,
+                sample_weight,
+                centers_init,
+                max_iter=self.max_iter,
+                verbose=self.verbose,
+                tol=self.tol,
+                n_threads=self._n_threads,
+            )
+
+            # allow small tolerance on the inertia to accommodate for
+            # non-deterministic rounding errors due to parallel computation
+            if best_inertia is None or inertia < best_inertia * (1 - 1e-6):
+                best_labels = labels
+                best_centers = centers
+                best_inertia = inertia
+
+        if self.verbose:
+            print(f"New centroids from bisection: {best_centers}")
+
+        # The scores decide which leaf `get_cluster_to_bisect` picks next.
+        if self.bisecting_strategy == "biggest_inertia":
+            scores = self._inertia_per_cluster(
+                X, best_centers, best_labels, sample_weight
+            )
+        else:  # bisecting_strategy == "largest_cluster"
+            # Using minlength to make sure that we have the counts for both labels even
+            # if all samples are labelled 0.
+            scores = np.bincount(best_labels, minlength=2)
+
+        cluster_to_bisect.split(best_labels, best_centers, scores)
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None, sample_weight=None):
+        """Compute bisecting k-means clustering.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+
+            Training instances to cluster.
+
+            .. note:: The data will be converted to C ordering,
+                which will cause a memory copy
+                if the given data is not C-contiguous.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            The weights for each observation in X. If None, all observations
+            are assigned equal weight. `sample_weight` is not used during
+            initialization if `init` is a callable.
+
+        Returns
+        -------
+        self
+            Fitted estimator.
+        """
+        X = self._validate_data(
+            X,
+            accept_sparse="csr",
+            dtype=[np.float64, np.float32],
+            order="C",
+            copy=self.copy_x,
+            accept_large_sparse=False,
+        )
+
+        self._check_params_vs_input(X)
+
+        self._random_state = check_random_state(self.random_state)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+        self._n_threads = _openmp_effective_n_threads()
+
+        if self.algorithm == "lloyd" or self.n_clusters == 1:
+            self._kmeans_single = _kmeans_single_lloyd
+            self._check_mkl_vcomp(X, X.shape[0])
+        else:
+            self._kmeans_single = _kmeans_single_elkan
+
+        X_is_sparse = sp.issparse(X)
+        if X_is_sparse:
+            # Sparse data is not centered. Drop a mean left over from an
+            # earlier fit on dense data: `_predict_recursive` adds `_X_mean`
+            # to the centers whenever the attribute exists, which would
+            # corrupt predictions made after this (uncentered) fit.
+            if hasattr(self, "_X_mean"):
+                del self._X_mean
+        else:
+            # Subtract of mean of X for more accurate distance computations
+            self._X_mean = X.mean(axis=0)
+            X -= self._X_mean
+
+        # Initialize the hierarchical clusters tree
+        self._bisecting_tree = _BisectingTree(
+            indices=np.arange(X.shape[0]),
+            center=X.mean(axis=0),
+            score=0,
+        )
+
+        x_squared_norms = row_norms(X, squared=True)
+
+        # Each bisection adds one leaf, so n_clusters - 1 bisections are needed.
+        for _ in range(self.n_clusters - 1):
+            # Chose cluster to bisect
+            cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()
+
+            # Split this cluster into 2 subclusters
+            self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)
+
+        # Aggregate final labels and centers from the bisecting tree
+        self.labels_ = np.full(X.shape[0], -1, dtype=np.int32)
+        self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
+
+        for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()):
+            self.labels_[cluster_node.indices] = i
+            self.cluster_centers_[i] = cluster_node.center
+            cluster_node.label = i  # label final clusters for future prediction
+            cluster_node.indices = None  # release memory
+
+        # Restore original data
+        if not X_is_sparse:
+            X += self._X_mean
+            self.cluster_centers_ += self._X_mean
+
+        _inertia = _inertia_sparse if X_is_sparse else _inertia_dense
+        self.inertia_ = _inertia(
+            X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads
+        )
+
+        self._n_features_out = self.cluster_centers_.shape[0]
+
+        return self
+
+    def predict(self, X):
+        """Predict which cluster each sample in X belongs to.
+
+        The samples are routed down the hierarchical tree built during
+        fitting, picking the closest of the two child centers at each node,
+        until a leaf cluster is reached.
+
+        In the vector quantization literature, `cluster_centers_` is called
+        the code book and each value returned by `predict` is the index of
+        the closest code in the code book.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            New data to predict.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to.
+        """
+        check_is_fitted(self)
+
+        X = self._check_test_data(X)
+
+        # The cython helpers require sample weights even though they do not
+        # influence the labels; use all-ones shaped like the row norms.
+        unit_weights = np.ones_like(row_norms(X, squared=True))
+
+        return self._predict_recursive(X, unit_weights, self._bisecting_tree)
+
+    def _predict_recursive(self, X, sample_weight, cluster_node):
+        """Label the samples by descending the hierarchical tree.
+
+        Parameters
+        ----------
+        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
+            The data points, currently assigned to `cluster_node`, to predict between
+            the subclusters of this node.
+
+        sample_weight : ndarray of shape (n_samples,)
+            The weights for each observation in X.
+
+        cluster_node : _BisectingTree node object
+            The cluster node of the hierarchical tree.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to.
+        """
+        n_samples = X.shape[0]
+        left_child = cluster_node.left
+
+        if left_child is None:
+            # Leaf node: every sample that got here receives the leaf's label.
+            return np.full(n_samples, cluster_node.label, dtype=np.int32)
+
+        right_child = cluster_node.right
+
+        # Row 0 is the left center, row 1 the right center. When the training
+        # data was centered, shift the stored centers back accordingly.
+        centers = np.vstack((left_child.center, right_child.center))
+        if hasattr(self, "_X_mean"):
+            centers += self._X_mean
+
+        nearest = _labels_inertia_threadpool_limit(
+            X,
+            sample_weight,
+            centers,
+            self._n_threads,
+            return_inertia=False,
+        )
+        goes_left = nearest == 0
+        goes_right = ~goes_left
+
+        # Recurse into each child with its own subset of the samples.
+        labels = np.full(n_samples, -1, dtype=np.int32)
+        labels[goes_left] = self._predict_recursive(
+            X[goes_left], sample_weight[goes_left], left_child
+        )
+        labels[goes_right] = self._predict_recursive(
+            X[goes_right], sample_weight[goes_right], right_child
+        )
+
+        return labels
+
+    def _more_tags(self):
+        """Return the estimator tags overridden by this class."""
+        preserved = [np.float64, np.float32]
+        return {"preserves_dtype": preserved}
@@ -0,0 +1,478 @@
+"""
+DBSCAN: Density-Based Spatial Clustering of Applications with Noise
+"""
+
+# Author: Robert Layton <robertlayton@gmail.com>
+#         Joel Nothman <joel.nothman@gmail.com>
+#         Lars Buitinck
+#
+# License: BSD 3 clause
+
+import warnings
+from numbers import Integral, Real
+
+import numpy as np
+from scipy import sparse
+
+from ..base import BaseEstimator, ClusterMixin, _fit_context
+from ..metrics.pairwise import _VALID_METRICS
+from ..neighbors import NearestNeighbors
+from ..utils._param_validation import Interval, StrOptions, validate_params
+from ..utils.validation import _check_sample_weight
+from ._dbscan_inner import dbscan_inner
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix"],
+        "sample_weight": ["array-like", None],
+    },
+    prefer_skip_nested_validation=False,
+)
+def dbscan(
+    X,
+    eps=0.5,
+    *,
+    min_samples=5,
+    metric="minkowski",
+    metric_params=None,
+    algorithm="auto",
+    leaf_size=30,
+    p=2,
+    sample_weight=None,
+    n_jobs=None,
+):
+    """Perform DBSCAN clustering from vector array or distance matrix.
+
+    Read more in the :ref:`User Guide <dbscan>`.
+
+    Parameters
+    ----------
+    X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
+            (n_samples, n_samples)
+        A feature array, or array of distances between samples if
+        ``metric='precomputed'``.
+
+    eps : float, default=0.5
+        The maximum distance between two samples for one to be considered
+        as in the neighborhood of the other. This is not a maximum bound
+        on the distances of points within a cluster. This is the most
+        important DBSCAN parameter to choose appropriately for your data set
+        and distance function.
+
+    min_samples : int, default=5
+        The number of samples (or total weight) in a neighborhood for a point
+        to be considered as a core point. This includes the point itself.
+
+    metric : str or callable, default='minkowski'
+        The metric to use when calculating distance between instances in a
+        feature array. If metric is a string or callable, it must be one of
+        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
+        its metric parameter.
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square during fit.
+        X may be a :term:`sparse graph <sparse graph>`,
+        in which case only "nonzero" elements may be considered neighbors.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+        .. versionadded:: 0.19
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        The algorithm to be used by the NearestNeighbors module
+        to compute pointwise distances and find nearest neighbors.
+        See NearestNeighbors module documentation for details.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or cKDTree. This can affect the speed
+        of the construction and query, as well as the memory required
+        to store the tree. The optimal value depends
+        on the nature of the problem.
+
+    p : float, default=2
+        The power of the Minkowski metric to be used to calculate distance
+        between points.
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Weight of each sample, such that a sample with a weight of at least
+        ``min_samples`` is by itself a core sample; a sample with negative
+        weight may inhibit its eps-neighbor from being core.
+        Note that weights are absolute, and default to 1.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search. ``None`` means
+        1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
+        using all processors. See :term:`Glossary <n_jobs>` for more details.
+        If precomputed distances are used, parallel execution is not available
+        and thus n_jobs will have no effect.
+
+    Returns
+    -------
+    core_samples : ndarray of shape (n_core_samples,)
+        Indices of core samples.
+
+    labels : ndarray of shape (n_samples,)
+        Cluster labels for each point.  Noisy samples are given the label -1.
+
+    See Also
+    --------
+    DBSCAN : An estimator interface for this clustering algorithm.
+    OPTICS : A similar estimator interface clustering at multiple values of
+        eps. Our implementation is optimized for memory usage.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/cluster/plot_dbscan.py
+    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
+
+    This implementation bulk-computes all neighborhood queries, which increases
+    the memory complexity to O(n.d) where d is the average number of neighbors,
+    while original DBSCAN had memory complexity O(n). It may attract a higher
+    memory complexity when querying these nearest neighborhoods, depending
+    on the ``algorithm``.
+
+    One way to avoid the query complexity is to pre-compute sparse
+    neighborhoods in chunks using
+    :func:`NearestNeighbors.radius_neighbors_graph
+    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
+    ``mode='distance'``, then using ``metric='precomputed'`` here.
+
+    Another way to reduce memory and computation time is to remove
+    (near-)duplicate points and use ``sample_weight`` instead.
+
+    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
+    memory usage.
+
+    References
+    ----------
+    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
+    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
+    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
+    In: Proceedings of the 2nd International Conference on Knowledge Discovery
+    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
+
+    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
+    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
+    <10.1145/3068335>`
+    ACM Transactions on Database Systems (TODS), 42(3), 19.
+
+    Examples
+    --------
+    >>> from sklearn.cluster import dbscan
+    >>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]
+    >>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
+    >>> core_samples
+    array([0, 1, 2, 3, 4])
+    >>> labels
+    array([ 0,  0,  0,  1,  1, -1])
+    """
+    # Delegate to the estimator; only its fitted attributes are returned.
+    est = DBSCAN(
+        eps=eps,
+        min_samples=min_samples,
+        metric=metric,
+        metric_params=metric_params,
+        algorithm=algorithm,
+        leaf_size=leaf_size,
+        p=p,
+        n_jobs=n_jobs,
+    )
+    est.fit(X, sample_weight=sample_weight)
+    return est.core_sample_indices_, est.labels_
+
+
+class DBSCAN(ClusterMixin, BaseEstimator):
+    """Perform DBSCAN clustering from vector array or distance matrix.
+
+    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
+    Finds core samples of high density and expands clusters from them.
+    Good for data which contains clusters of similar density.
+
+    This implementation has a worst case memory complexity of :math:`O({n}^2)`,
+    which can occur when the `eps` param is large and `min_samples` is low,
+    while the original DBSCAN only uses linear memory.
+    For further details, see the Notes below.
+
+    Read more in the :ref:`User Guide <dbscan>`.
+
+    Parameters
+    ----------
+    eps : float, default=0.5
+        The maximum distance between two samples for one to be considered
+        as in the neighborhood of the other. This is not a maximum bound
+        on the distances of points within a cluster. This is the most
+        important DBSCAN parameter to choose appropriately for your data set
+        and distance function.
+
+    min_samples : int, default=5
+        The number of samples (or total weight) in a neighborhood for a point to
+        be considered as a core point. This includes the point itself. If
+        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
+        whereas if it is set to a lower value, the found clusters will be more
+        sparse.
+
+    metric : str, or callable, default='euclidean'
+        The metric to use when calculating distance between instances in a
+        feature array. If metric is a string or callable, it must be one of
+        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
+        its metric parameter.
+        If metric is "precomputed", X is assumed to be a distance matrix and
+        must be square. X may be a :term:`sparse graph`, in which
+        case only "nonzero" elements may be considered neighbors for DBSCAN.
+
+        .. versionadded:: 0.17
+           metric *precomputed* to accept precomputed sparse matrix.
+
+    metric_params : dict, default=None
+        Additional keyword arguments for the metric function.
+
+        .. versionadded:: 0.19
+
+    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+        The algorithm to be used by the NearestNeighbors module
+        to compute pointwise distances and find nearest neighbors.
+        See NearestNeighbors module documentation for details.
+
+    leaf_size : int, default=30
+        Leaf size passed to BallTree or cKDTree. This can affect the speed
+        of the construction and query, as well as the memory required
+        to store the tree. The optimal value depends
+        on the nature of the problem.
+
+    p : float, default=None
+        The power of the Minkowski metric to be used to calculate distance
+        between points. If None, then ``p=2`` (equivalent to the Euclidean
+        distance).
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    Attributes
+    ----------
+    core_sample_indices_ : ndarray of shape (n_core_samples,)
+        Indices of core samples.
+
+    components_ : ndarray of shape (n_core_samples, n_features)
+        Copy of each core sample found by training.
+
+    labels_ : ndarray of shape (n_samples,)
+        Cluster labels for each point in the dataset given to fit().
+        Noisy samples are given the label -1.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    OPTICS : A similar clustering at multiple values of eps. Our implementation
+        is optimized for memory usage.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/cluster/plot_dbscan.py
+    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
+
+    This implementation bulk-computes all neighborhood queries, which increases
+    the memory complexity to O(n.d) where d is the average number of neighbors,
+    while original DBSCAN had memory complexity O(n). It may attract a higher
+    memory complexity when querying these nearest neighborhoods, depending
+    on the ``algorithm``.
+
+    One way to avoid the query complexity is to pre-compute sparse
+    neighborhoods in chunks using
+    :func:`NearestNeighbors.radius_neighbors_graph
+    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
+    ``mode='distance'``, then using ``metric='precomputed'`` here.
+
+    Another way to reduce memory and computation time is to remove
+    (near-)duplicate points and use ``sample_weight`` instead.
+
+    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory
+    usage.
+
+    References
+    ----------
+    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
+    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
+    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
+    In: Proceedings of the 2nd International Conference on Knowledge Discovery
+    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
+
+    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
+    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
+    <10.1145/3068335>`
+    ACM Transactions on Database Systems (TODS), 42(3), 19.
+
+    Examples
+    --------
+    >>> from sklearn.cluster import DBSCAN
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [2, 2], [2, 3],
+    ...               [8, 7], [8, 8], [25, 80]])
+    >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
+    >>> clustering.labels_
+    array([ 0,  0,  0,  1,  1, -1])
+    >>> clustering
+    DBSCAN(eps=3, min_samples=2)
+    """
+
+    _parameter_constraints: dict = {
+        "eps": [Interval(Real, 0.0, None, closed="neither")],
+        "min_samples": [Interval(Integral, 1, None, closed="left")],
+        "metric": [
+            StrOptions(set(_VALID_METRICS) | {"precomputed"}),
+            callable,
+        ],
+        "metric_params": [dict, None],
+        "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
+        "leaf_size": [Interval(Integral, 1, None, closed="left")],
+        "p": [Interval(Real, 0.0, None, closed="left"), None],
+        "n_jobs": [Integral, None],
+    }
+
+    def __init__(
+        self,
+        eps=0.5,
+        *,
+        min_samples=5,
+        metric="euclidean",
+        metric_params=None,
+        algorithm="auto",
+        leaf_size=30,
+        p=None,
+        n_jobs=None,
+    ):
+        self.eps = eps
+        self.min_samples = min_samples
+        self.metric = metric
+        self.metric_params = metric_params
+        self.algorithm = algorithm
+        self.leaf_size = leaf_size
+        self.p = p
+        self.n_jobs = n_jobs
+
+    @_fit_context(
+        # DBSCAN.metric is not validated yet
+        prefer_skip_nested_validation=False
+    )
+    def fit(self, X, y=None, sample_weight=None):
+        """Perform DBSCAN clustering from features, or distance matrix.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
+            (n_samples, n_samples)
+            Training instances to cluster, or distances between instances if
+            ``metric='precomputed'``. If a sparse matrix is provided, it will
+            be converted into a sparse ``csr_matrix``.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Weight of each sample, such that a sample with a weight of at least
+            ``min_samples`` is by itself a core sample; a sample with a
+            negative weight may inhibit its eps-neighbor from being core.
+            Note that weights are absolute, and default to 1.
+
+        Returns
+        -------
+        self : object
+            Returns a fitted instance of self.
+        """
+        X = self._validate_data(X, accept_sparse="csr")
+
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X)
+
+        # Calculate neighborhood for all samples. This leaves the original
+        # point in, which needs to be considered later (i.e. point i is in the
+        # neighborhood of point i. While true, it's useless information)
+        if self.metric == "precomputed" and sparse.issparse(X):
+            # set the diagonal to explicit values, as a point is its own
+            # neighbor
+            X = X.copy()  # copy to avoid in-place modification
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
+                X.setdiag(X.diagonal())
+
+        neighbors_model = NearestNeighbors(
+            radius=self.eps,
+            algorithm=self.algorithm,
+            leaf_size=self.leaf_size,
+            metric=self.metric,
+            metric_params=self.metric_params,
+            p=self.p,
+            n_jobs=self.n_jobs,
+        )
+        neighbors_model.fit(X)
+        # This has worst case O(n^2) memory complexity
+        neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)
+
+        if sample_weight is None:
+            n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
+        else:
+            n_neighbors = np.array(
+                [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
+            )
+
+        # Initially, all samples are noise.
+        labels = np.full(X.shape[0], -1, dtype=np.intp)
+
+        # Mask (as uint8) flagging the samples that are core samples.
+        core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
+        dbscan_inner(core_samples, neighborhoods, labels)  # labels set in place
+
+        self.core_sample_indices_ = np.where(core_samples)[0]
+        self.labels_ = labels
+
+        if len(self.core_sample_indices_):
+            # fix for scipy sparse indexing issue
+            self.components_ = X[self.core_sample_indices_].copy()
+        else:
+            # no core samples
+            self.components_ = np.empty((0, X.shape[1]))
+        return self
+
+    def fit_predict(self, X, y=None, sample_weight=None):
+        """Compute clusters from a data or distance matrix and predict labels.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
+            (n_samples, n_samples)
+            Training instances to cluster, or distances between instances if
+            ``metric='precomputed'``. If a sparse matrix is provided, it will
+            be converted into a sparse ``csr_matrix``.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Weight of each sample, such that a sample with a weight of at least
+            ``min_samples`` is by itself a core sample; a sample with a
+            negative weight may inhibit its eps-neighbor from being core.
+            Note that weights are absolute, and default to 1.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Cluster labels. Noisy samples are given the label -1.
+        """
+        self.fit(X, sample_weight=sample_weight)
+        return self.labels_
+
+    def _more_tags(self):
+        return {"pairwise": self.metric == "precomputed"}
@@ -0,0 +1,40 @@
+# Fast inner loop for DBSCAN.
+# Author: Lars Buitinck
+# License: 3-clause BSD
+
+from libcpp.vector cimport vector
+
+from ..utils._typedefs cimport uint8_t, intp_t
+
+
+def dbscan_inner(const uint8_t[::1] is_core,
+                 object[:] neighborhoods,
+                 intp_t[::1] labels):
+    cdef intp_t seed, node, k, neighbor, cluster_id = 0
+    cdef intp_t[:] node_neighbors
+    cdef vector[intp_t] pending
+
+    for seed in range(labels.shape[0]):
+        # Only an unlabeled core sample can start a new cluster.
+        if not (labels[seed] == -1 and is_core[seed]):
+            continue
+
+        # Iterative depth-first traversal rooted at `seed`. It mirrors a
+        # connected-components search, except that non-core samples reached
+        # along the way receive the cluster label without being expanded.
+        node = seed
+        while True:
+            if labels[node] == -1:
+                labels[node] = cluster_id
+                if is_core[node]:
+                    node_neighbors = neighborhoods[node]
+                    for k in range(node_neighbors.shape[0]):
+                        neighbor = node_neighbors[k]
+                        if labels[neighbor] == -1:
+                            pending.push_back(neighbor)
+            if pending.size() == 0:
+                break
+            node = pending.back()
+            pending.pop_back()
+
+        cluster_id += 1
@@ -0,0 +1,92 @@
+"""
+Feature agglomeration. Base classes and functions for performing feature
+agglomeration.
+"""
+
+# Author: V. Michel, A. Gramfort
+# License: BSD 3 clause
+
+
+import numpy as np
+from scipy.sparse import issparse
+
+from ..base import TransformerMixin
+from ..utils import metadata_routing
+from ..utils.deprecation import _deprecate_Xt_in_inverse_transform
+from ..utils.validation import check_is_fitted
+
+###############################################################################
+# Mixin class for feature agglomeration.
+
+
+class AgglomerationTransform(TransformerMixin):
+    """
+    A class for feature agglomeration via the transform interface.
+    """
+
+    # This prevents ``set_split_inverse_transform`` to be generated for the
+    # non-standard ``Xt`` arg on ``inverse_transform``.
+    # TODO(1.7): remove when Xt is removed for inverse_transform.
+    __metadata_request__inverse_transform = {"Xt": metadata_routing.UNUSED}
+
+    def transform(self, X):
+        """
+        Transform a new matrix using the built clustering.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features) or \
+                (n_samples, n_samples)
+            A M by N array of M observations in N dimensions or a length
+            M array of M one-dimensional observations.
+
+        Returns
+        -------
+        Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)
+            The pooled values for each feature cluster.
+        """
+        check_is_fitted(self)
+
+        X = self._validate_data(X, reset=False)
+        if self.pooling_func == np.mean and not issparse(X):
+            size = np.bincount(self.labels_)
+            n_samples = X.shape[0]
+            # a fast way to compute the mean of grouped features
+            nX = np.array(
+                [np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)]
+            )
+        else:
+            nX = [
+                self.pooling_func(X[:, self.labels_ == l], axis=1)
+                for l in np.unique(self.labels_)
+            ]
+            nX = np.array(nX).T
+        return nX
+
+    def inverse_transform(self, X=None, *, Xt=None):
+        """
+        Inverse the transformation and return a vector of size `n_features`.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_clusters) or (n_clusters,)
+            The values to be assigned to each cluster of samples.
+
+        Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,)
+            The values to be assigned to each cluster of samples.
+
+            .. deprecated:: 1.5
+                `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead.
+
+        Returns
+        -------
+        X : ndarray of shape (n_samples, n_features) or (n_features,)
+            A vector of size `n_samples` with the values of `Xred` assigned to
+            each of the cluster of samples.
+        """
+        X = _deprecate_Xt_in_inverse_transform(X, Xt)
+
+        check_is_fitted(self)
+
+        unil, inverse = np.unique(self.labels_, return_inverse=True)
+        return X[..., inverse]
@@ -0,0 +1,272 @@
+# Minimum spanning tree single linkage implementation for hdbscan
+# Authors: Leland McInnes <leland.mcinnes@gmail.com>
+#          Steve Astels <sastels@gmail.com>
+#          Meekail Zain <zainmeekail@gmail.com>
+# Copyright (c) 2015, Leland McInnes
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+cimport numpy as cnp
+from libc.float cimport DBL_MAX
+
+import numpy as np
+from ...metrics._dist_metrics cimport DistanceMetric64
+from ...cluster._hierarchical_fast cimport UnionFind
+from ...cluster._hdbscan._tree cimport HIERARCHY_t
+from ...cluster._hdbscan._tree import HIERARCHY_dtype
+from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t
+
+cnp.import_array()
+
+cdef extern from "numpy/arrayobject.h":
+    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
+
+# Numpy structured dtype representing a single ordered edge in Prim's algorithm
+MST_edge_dtype = np.dtype([
+    ("current_node", np.int64),
+    ("next_node", np.int64),
+    ("distance", np.float64),
+])
+
+# C-level mirror of `MST_edge_dtype` (same field names, order and widths).
+# Packed shouldn't matter since all fields are 8-byte quantities; kept to be safe.
+ctypedef packed struct MST_edge_t:
+    int64_t current_node
+    int64_t next_node
+    float64_t distance
+
+cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
+    cnp.ndarray[float64_t, ndim=2] mutual_reachability
+):
+    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
+    reachability graph using Prim's algorithm.
+
+    Parameters
+    ----------
+    mutual_reachability : ndarray of shape (n_samples, n_samples)
+        Array of mutual-reachabilities between samples.
+
+    Returns
+    -------
+    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
+        The MST representation of the mutual-reachability graph. The MST is
+        represented as a collection of edges.
+    """
+    cdef:
+        # Note: we utilize ndarray's over memory-views to make use of numpy
+        # binary indexing and sub-selection below.
+        cnp.ndarray[int64_t, ndim=1, mode='c'] current_labels
+        cnp.ndarray[float64_t, ndim=1, mode='c'] min_reachability, left, right
+        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
+
+        cnp.ndarray[uint8_t, mode='c'] label_filter
+
+        int64_t n_samples = PyArray_SHAPE(<cnp.PyArrayObject*> mutual_reachability)[0]
+        int64_t current_node, new_node_index, new_node, i
+
+    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
+    current_labels = np.arange(n_samples, dtype=np.int64)
+    current_node = 0  # the tree is grown starting from sample 0
+    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
+    for i in range(0, n_samples - 1):
+        label_filter = current_labels != current_node  # drop the node just added
+        current_labels = current_labels[label_filter]
+        left = min_reachability[label_filter]  # minima carried over
+        right = mutual_reachability[current_node][current_labels]
+        min_reachability = np.minimum(left, right)
+        # Next node: the remaining sample with the smallest reachability so far.
+        new_node_index = np.argmin(min_reachability)
+        new_node = current_labels[new_node_index]
+        mst[i].current_node = current_node  # always the most recently added node
+        mst[i].next_node = new_node
+        mst[i].distance = min_reachability[new_node_index]
+        current_node = new_node
+
+    return mst
+
+
+cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
+    const float64_t[:, ::1] raw_data,
+    const float64_t[::1] core_distances,
+    DistanceMetric64 dist_metric,
+    float64_t alpha=1.0
+):
+    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
+    reachability graph generated from the provided `raw_data` and
+    `core_distances` using Prim's algorithm.
+
+    Parameters
+    ----------
+    raw_data : ndarray of shape (n_samples, n_features)
+        Input array of data samples.
+
+    core_distances : ndarray of shape (n_samples,)
+        The core-distance calculated for each corresponding sample.
+
+    dist_metric : DistanceMetric
+        Metric used to compute the pairwise distances between samples.
+
+    alpha : float, default=1.0
+        Scaling factor: each pairwise distance is divided by `alpha` before use.
+
+    Returns
+    -------
+    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
+        The MST of the mutual-reachability graph, as a collection of edges.
+    """
+
+    cdef:
+        uint8_t[::1] in_tree
+        float64_t[::1] min_reachability
+        int64_t[::1] current_sources
+        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
+
+        int64_t current_node, source_node, new_node, next_node_source
+        int64_t i, j, n_samples, num_features
+
+        float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
+        float64_t next_node_min_reach, pair_distance, next_node_core_dist
+
+    n_samples = raw_data.shape[0]
+    num_features = raw_data.shape[1]
+
+    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
+
+    in_tree = np.zeros(n_samples, dtype=np.uint8)
+    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
+    current_sources = np.ones(n_samples, dtype=np.int64)  # source of each best edge
+
+    current_node = 0
+
+    for i in range(0, n_samples - 1):
+
+        in_tree[current_node] = 1
+
+        current_node_core_dist = core_distances[current_node]
+
+        new_reachability = DBL_MAX
+        source_node = 0
+        new_node = 0
+
+        for j in range(n_samples):
+            if in_tree[j]:
+                continue
+
+            next_node_min_reach = min_reachability[j]
+            next_node_source = current_sources[j]
+
+            pair_distance = dist_metric.dist(
+                &raw_data[current_node, 0],
+                &raw_data[j, 0],
+                num_features
+            )
+
+            pair_distance /= alpha
+
+            next_node_core_dist = core_distances[j]
+            mutual_reachability_distance = max(
+                current_node_core_dist,
+                next_node_core_dist,
+                pair_distance
+            )
+            if mutual_reachability_distance > next_node_min_reach:  # keep old edge
+                if next_node_min_reach < new_reachability:
+                    new_reachability = next_node_min_reach
+                    source_node = next_node_source
+                    new_node = j
+                continue
+
+            if mutual_reachability_distance < next_node_min_reach:  # better edge
+                min_reachability[j] = mutual_reachability_distance
+                current_sources[j] = current_node
+                if mutual_reachability_distance < new_reachability:
+                    new_reachability = mutual_reachability_distance
+                    source_node = current_node
+                    new_node = j
+            else:  # tie: stored edge is kept
+                if next_node_min_reach < new_reachability:
+                    new_reachability = next_node_min_reach
+                    source_node = next_node_source
+                    new_node = j
+        # Record the cheapest edge found and continue growing the tree from new_node.
+        mst[i].current_node = source_node
+        mst[i].next_node = new_node
+        mst[i].distance = new_reachability
+        current_node = new_node
+
+    return mst
+
+cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
+    """Construct a single-linkage tree from an MST.
+
+    Parameters
+    ----------
+    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
+        The MST representation of the mutual-reachability graph. The MST is
+        represented as a collection of edges.
+
+    Returns
+    -------
+    single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
+        The single-linkage tree (dendrogram) built from the MST. Each element
+        of the array holds the following:
+
+        - left node/cluster
+        - right node/cluster
+        - distance
+        - new cluster size
+    """
+    cdef:
+        cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage
+
+        # Note mst.shape[0] is one fewer than the number of samples
+        int64_t n_samples = mst.shape[0] + 1
+        intp_t current_node_cluster, next_node_cluster
+        int64_t current_node, next_node, i
+        float64_t distance
+        UnionFind U = UnionFind(n_samples)
+
+    single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype)
+
+    for i in range(n_samples - 1):
+
+        current_node = mst[i].current_node
+        next_node = mst[i].next_node
+        distance = mst[i].distance
+
+        current_node_cluster = U.fast_find(current_node)
+        next_node_cluster = U.fast_find(next_node)
+
+        single_linkage[i].left_node = current_node_cluster
+        single_linkage[i].right_node = next_node_cluster
+        single_linkage[i].value = distance
+        single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster]
+        # Sizes above are read before the union so each side is counted once.
+        U.union(current_node_cluster, next_node_cluster)
+
+    return single_linkage
@@ -0,0 +1,212 @@
+# mutual reachability distance computations
+# Authors: Leland McInnes <leland.mcinnes@gmail.com>
+#          Meekail Zain <zainmeekail@gmail.com>
+#          Guillaume Lemaitre <g.lemaitre58@gmail.com>
+# Copyright (c) 2015, Leland McInnes
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+cimport numpy as cnp
+
+import numpy as np
+from scipy.sparse import issparse
+from cython cimport floating, integral
+from libc.math cimport isfinite, INFINITY
+from ...utils._typedefs cimport intp_t
+cnp.import_array()
+
+
+def mutual_reachability_graph(
+    distance_matrix, min_samples=5, max_distance=0.0
+):
+    """Turn a pairwise distance matrix into a mutual reachability graph.
+
+    The mutual reachability distance between two samples is defined as::
+
+        max(d_core(x_p), d_core(x_q), d(x_p, x_q))
+
+    where the core distance `d_core(x_p)` is the distance between `x_p` and
+    its k-th nearest neighbor.
+
+    The input is overwritten: `distance_matrix` is modified in-place and the
+    very same object is returned.
+
+    Parameters
+    ----------
+    distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
+        Array of distances between samples. If sparse, the array must be in
+        `CSR` format.
+
+    min_samples : int, default=5
+        The number of points in a neighbourhood for a point to be considered
+        a core point. `min_samples - 1` is forwarded to the dense/sparse
+        helpers as the index of the neighbor defining the core distance.
+
+    max_distance : float, default=0.0
+        The distance which `np.inf` is replaced with. When the true mutual-
+        reachability distance is measured to be infinite, it is instead
+        truncated to `max_distance`. Only used when `distance_matrix` is a
+        sparse matrix.
+
+    Returns
+    -------
+    mutual_reachability_graph : {ndarray, sparse matrix} of shape \
+            (n_samples, n_samples)
+        Weighted adjacency matrix of the mutual reachability graph.
+
+    Raises
+    ------
+    ValueError
+        If `distance_matrix` is sparse but not in CSR format.
+
+    References
+    ----------
+    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
+       Density-based clustering based on hierarchical density estimates.
+       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
+       (pp. 160-172). Springer Berlin Heidelberg.
+    """
+    neighbor_rank = min_samples - 1
+
+    # Dense input: no format restriction, handled by the dense helper.
+    if not issparse(distance_matrix):
+        _dense_mutual_reachability_graph(
+            distance_matrix, further_neighbor_idx=neighbor_rank
+        )
+        return distance_matrix
+
+    # Sparse input: the helper works on the raw CSR arrays.
+    if distance_matrix.format != "csr":
+        raise ValueError(
+            "Only sparse CSR matrices are supported for `distance_matrix`."
+        )
+
+    _sparse_mutual_reachability_graph(
+        distance_matrix.data,
+        distance_matrix.indices,
+        distance_matrix.indptr,
+        distance_matrix.shape[0],
+        further_neighbor_idx=neighbor_rank,
+        max_distance=max_distance,
+    )
+    return distance_matrix
+
+
+def _dense_mutual_reachability_graph(
+    floating[:, :] distance_matrix,
+    intp_t further_neighbor_idx,
+):
+    """Overwrite a dense distance matrix with mutual reachability distances.
+
+    Every entry `(i, j)` is replaced in-place by the largest of: the core
+    distance of sample `i`, the core distance of sample `j`, and the original
+    distance stored at `(i, j)`.
+
+    Parameters
+    ----------
+    distance_matrix : ndarray of shape (n_samples, n_samples)
+        Array of distances between samples. Modified in-place.
+
+    further_neighbor_idx : int
+        The index of the furthest neighbor to use to define the core distances.
+    """
+    cdef:
+        intp_t n_samples = distance_matrix.shape[0]
+        intp_t row, col
+        floating reachability
+        floating[::1] core_distances
+
+    # The distance matrix is assumed to be symmetric. Rows are partitioned so
+    # that this implementation matches the sparse one, which requires a CSR
+    # matrix. The core distances are extracted before any entry is
+    # overwritten below.
+    row_partitioned = np.partition(
+        distance_matrix, further_neighbor_idx, axis=1
+    )
+    core_distances = np.ascontiguousarray(
+        row_partitioned[:, further_neighbor_idx]
+    )
+
+    with nogil:
+        # TODO: Update w/ prange with thread count based on
+        # _openmp_effective_n_threads
+        for row in range(n_samples):
+            for col in range(n_samples):
+                reachability = max(
+                    core_distances[row],
+                    core_distances[col],
+                    distance_matrix[row, col],
+                )
+                distance_matrix[row, col] = reachability
+
+
+def _sparse_mutual_reachability_graph(
+    cnp.ndarray[floating, ndim=1, mode="c"] data,
+    cnp.ndarray[integral, ndim=1, mode="c"] indices,
+    cnp.ndarray[integral, ndim=1, mode="c"] indptr,
+    intp_t n_samples,
+    intp_t further_neighbor_idx,
+    floating max_distance,
+):
+    """Sparse implementation of mutual reachability graph.
+
+    The computation is done in-place, i.e. `data` is overwritten directly.
+    The caller passes the `data`, `indices` and `indptr` arrays of a `CSR`
+    sparse distance matrix; no other sparse format is supported.
+
+    Parameters
+    ----------
+    data : ndarray of shape (n_stored_values,)
+        Stored distances of the CSR matrix. Overwritten with the mutual
+        reachability distances.
+
+    indices : ndarray of shape (n_stored_values,)
+        Column index of each entry of `data`.
+
+    indptr : ndarray of shape (n_samples + 1,)
+        Row pointers: the entries of row `i` are
+        `data[indptr[i]:indptr[i + 1]]`.
+
+    n_samples : int
+        Number of rows of the distance matrix.
+
+    further_neighbor_idx : int
+        The index of the furthest neighbor to use to define the core distances.
+        A row with `further_neighbor_idx` or fewer stored values gets an
+        infinite core distance.
+
+    max_distance : float
+        The distance which `np.inf` is replaced with. When the true mutual-
+        reachability distance is measured to be infinite, it is instead
+        truncated to `max_distance`. The replacement only happens when
+        `max_distance > 0`; otherwise the stored value is left untouched.
+    """
+    cdef:
+        integral i, col_ind, row_ind
+        floating mutual_reachibility_distance
+        floating[:] core_distances
+        floating[:] row_data
+
+    # Pick the NumPy dtype that matches the `floating` specialization.
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    core_distances = np.empty(n_samples, dtype=dtype)
+
+    # First pass: core distance of each row from its stored values only.
+    for i in range(n_samples):
+        row_data = data[indptr[i]:indptr[i + 1]]
+        if further_neighbor_idx < row_data.size:
+            core_distances[i] = np.partition(
+                row_data, further_neighbor_idx
+            )[further_neighbor_idx]
+        else:
+            # Not enough stored neighbors to define a core distance.
+            core_distances[i] = INFINITY
+
+    # Second pass: rewrite every stored entry in place.
+    with nogil:
+        for row_ind in range(n_samples):
+            for i in range(indptr[row_ind], indptr[row_ind + 1]):
+                col_ind = indices[i]
+                mutual_reachibility_distance = max(
+                    core_distances[row_ind], core_distances[col_ind], data[i]
+                )
+                if isfinite(mutual_reachibility_distance):
+                    data[i] = mutual_reachibility_distance
+                elif max_distance > 0:
+                    # Non-finite result: truncate to the user-provided cap.
+                    data[i] = max_distance
@@ -0,0 +1,49 @@
+# Copyright (c) 2015, Leland McInnes
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+from ...utils._typedefs cimport intp_t, float64_t, uint8_t
+cimport numpy as cnp
+
+# One merge step of a single-linkage tree (dendrogram).
+# This corresponds to the scipy.cluster.hierarchy format
+ctypedef packed struct HIERARCHY_t:
+    # Ids of the two nodes/clusters joined by this merge.
+    intp_t left_node
+    intp_t right_node
+    # Distance at which the two nodes are merged.
+    float64_t value
+    # Number of samples in the cluster created by this merge.
+    intp_t cluster_size
+
+# Effectively an edgelist encoding a parent/child pair, along with a value and
+# the corresponding cluster_size in each row providing a tree structure.
+ctypedef packed struct CONDENSED_t:
+    # Id of the cluster the edge starts from.
+    intp_t parent
+    # Id of the child: either a sub-cluster or an individual sample.
+    intp_t child
+    # Value attached to the edge (the condensing code stores a lambda value,
+    # i.e. an inverse distance, here).
+    float64_t value
+    # Size of the child (1 when the child is an individual sample).
+    intp_t cluster_size
+
+# Declare NumPy's C-level PyArray_SHAPE accessor so that array dimensions can
+# be read through a raw `PyArrayObject *`; it returns a pointer to the shape.
+cdef extern from "numpy/arrayobject.h":
+    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
@@ -0,0 +1,799 @@
+# Tree handling (condensing, finding stable clusters) for hdbscan
+# Authors: Leland McInnes
+# Copyright (c) 2015, Leland McInnes
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+cimport numpy as cnp
+from libc.math cimport isinf
+import cython
+
+import numpy as np
+
+cnp.import_array()
+
+# Declare NumPy's C-level PyArray_SHAPE accessor; it is used below to read the
+# length of structured arrays through a raw `PyArrayObject *`.
+cdef extern from "numpy/arrayobject.h":
+    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
+
+# Lambda value used when a merge distance is not strictly positive.
+cdef cnp.float64_t INFTY = np.inf
+# Label given to samples that do not belong to any cluster.
+cdef cnp.intp_t NOISE = -1
+
+# NumPy structured dtype for one row of a single-linkage tree; the field
+# names are the ones accessed on HIERARCHY_t values in this module.
+HIERARCHY_dtype = np.dtype([
+    ("left_node", np.intp),
+    ("right_node", np.intp),
+    ("value", np.float64),
+    ("cluster_size", np.intp),
+])
+
+# NumPy structured dtype for one edge of a condensed tree; the field names
+# are the ones accessed on CONDENSED_t values in this module.
+CONDENSED_dtype = np.dtype([
+    ("parent", np.intp),
+    ("child", np.intp),
+    ("value", np.float64),
+    ("cluster_size", np.intp),
+])
+
+cpdef tuple tree_to_labels(
+    const HIERARCHY_t[::1] single_linkage_tree,
+    cnp.intp_t min_cluster_size=10,
+    cluster_selection_method="eom",
+    bint allow_single_cluster=False,
+    cnp.float64_t cluster_selection_epsilon=0.0,
+    max_cluster_size=None,
+):
+    """Derive flat cluster labels and probabilities from a single-linkage tree.
+
+    The tree is first condensed with `_condense_tree`, the stability of each
+    condensed cluster is computed with `_compute_stability`, and both are
+    handed to `_get_clusters` to select the final clusters.
+
+    Parameters
+    ----------
+    single_linkage_tree : ndarray, dtype=HIERARCHY_dtype
+        The single-linkage tree to extract clusters from.
+
+    min_cluster_size : int, default=10
+        Minimum cluster size used when condensing the tree.
+
+    cluster_selection_method : str, default="eom"
+        Forwarded to `_get_clusters`.
+
+    allow_single_cluster : bool, default=False
+        Forwarded to `_get_clusters`.
+
+    cluster_selection_epsilon : float, default=0.0
+        Forwarded to `_get_clusters`.
+
+    max_cluster_size : int, default=None
+        Forwarded to `_get_clusters`.
+
+    Returns
+    -------
+    (labels, probabilities) : tuple of ndarray
+        The two arrays produced by `_get_clusters`.
+    """
+    cdef:
+        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
+        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
+        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities
+        dict stability
+
+    condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size)
+    stability = _compute_stability(condensed_tree)
+
+    labels, probabilities = _get_clusters(
+        condensed_tree,
+        stability,
+        cluster_selection_method,
+        allow_single_cluster,
+        cluster_selection_epsilon,
+        max_cluster_size,
+    )
+
+    return (labels, probabilities)
+
+cdef list bfs_from_hierarchy(
+    const HIERARCHY_t[::1] hierarchy,
+    cnp.intp_t bfs_root
+):
+    """
+    Return the node ids reachable from `bfs_root`, in breadth-first order,
+    for a tree in scipy hclust format. The root itself comes first.
+    """
+
+    cdef list frontier, next_level, visited
+    cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
+    cdef cnp.intp_t node, row
+
+    frontier = [bfs_root]
+    visited = []
+
+    while frontier:
+        visited.extend(frontier)
+        next_level = []
+        for node in frontier:
+            # Ids below n_samples are individual samples and have no
+            # children. By construction, node i >= n_samples is formed by the
+            # union of the two nodes stored in hierarchy[i - n_samples].
+            if node >= n_samples:
+                row = node - n_samples
+                next_level.append(hierarchy[row].left_node)
+                next_level.append(hierarchy[row].right_node)
+        frontier = next_level
+    return visited
+
+
+cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
+    const HIERARCHY_t[::1] hierarchy,
+    cnp.intp_t min_cluster_size=10
+):
+    """Condense a tree according to a minimum cluster size. This is akin
+    to the runt pruning procedure of Stuetzle. The result is a much simpler
+    tree that is easier to visualize. We include extra information on the
+    lambda value at which individual points depart clusters for later
+    analysis and computation.
+
+    Parameters
+    ----------
+    hierarchy : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
+        A single linkage hierarchy in scipy.cluster.hierarchy format.
+
+    min_cluster_size : int, optional (default 10)
+        The minimum size of clusters to consider. Clusters smaller than this
+        are pruned from the tree.
+
+    Returns
+    -------
+    condensed_tree : ndarray, dtype=CONDENSED_dtype
+        Effectively an edgelist encoding a parent/child pair, along with a
+        value and the corresponding cluster_size in each row providing a tree
+        structure. The root cluster is labelled `n_samples`; every genuine
+        split creates two new, consecutively numbered cluster labels.
+    """
+
+    cdef:
+        # Node ids run from 0 to 2 * (n_samples - 1); the last one is the root.
+        cnp.intp_t root = 2 * hierarchy.shape[0]
+        cnp.intp_t n_samples = hierarchy.shape[0] + 1
+        cnp.intp_t next_label = n_samples + 1
+        list result_list, node_list = bfs_from_hierarchy(hierarchy, root)
+
+        # relabel[node]: condensed-tree label of a hierarchy node.
+        cnp.intp_t[::1] relabel
+        # ignore[node]: nonzero once the node was absorbed by a pruned branch.
+        cnp.uint8_t[::1] ignore
+
+        cnp.intp_t node, sub_node, left, right
+        cnp.float64_t lambda_value, distance
+        cnp.intp_t left_count, right_count
+        HIERARCHY_t children
+
+    relabel = np.empty(root + 1, dtype=np.intp)
+    relabel[root] = n_samples
+    result_list = []
+    ignore = np.zeros(len(node_list), dtype=bool)
+
+    # Walk the hierarchy top-down so that a parent is always relabelled
+    # before its children are visited.
+    for node in node_list:
+        # Skip absorbed nodes and individual samples (ids below n_samples).
+        if ignore[node] or node < n_samples:
+            continue
+
+        children = hierarchy[node - n_samples]
+        left = children.left_node
+        right = children.right_node
+        distance = children.value
+        # lambda is the inverse merge distance; non-positive distances map
+        # to an infinite lambda.
+        if distance > 0.0:
+            lambda_value = 1.0 / distance
+        else:
+            lambda_value = INFTY
+
+        # A child that is an individual sample counts as size 1.
+        if left >= n_samples:
+            left_count = hierarchy[left - n_samples].cluster_size
+        else:
+            left_count = 1
+
+        if right >= n_samples:
+            right_count = <cnp.intp_t> hierarchy[right - n_samples].cluster_size
+        else:
+            right_count = 1
+
+        if left_count >= min_cluster_size and right_count >= min_cluster_size:
+            # Genuine split: both children become new clusters.
+            relabel[left] = next_label
+            next_label += 1
+            result_list.append(
+                (relabel[node], relabel[left], lambda_value, left_count)
+            )
+
+            relabel[right] = next_label
+            next_label += 1
+            result_list.append(
+                (relabel[node], relabel[right], lambda_value, right_count)
+            )
+
+        elif left_count < min_cluster_size and right_count < min_cluster_size:
+            # Both children are too small: every sample below them leaves
+            # the current cluster at this lambda.
+            for sub_node in bfs_from_hierarchy(hierarchy, left):
+                if sub_node < n_samples:
+                    result_list.append(
+                        (relabel[node], sub_node, lambda_value, 1)
+                    )
+                ignore[sub_node] = True
+
+            for sub_node in bfs_from_hierarchy(hierarchy, right):
+                if sub_node < n_samples:
+                    result_list.append(
+                        (relabel[node], sub_node, lambda_value, 1)
+                    )
+                ignore[sub_node] = True
+
+        elif left_count < min_cluster_size:
+            # Only the left child is too small: the right child carries on
+            # with the parent's label and the left samples fall out.
+            relabel[right] = relabel[node]
+            for sub_node in bfs_from_hierarchy(hierarchy, left):
+                if sub_node < n_samples:
+                    result_list.append(
+                        (relabel[node], sub_node, lambda_value, 1)
+                    )
+                ignore[sub_node] = True
+
+        else:
+            # Only the right child is too small: mirror of the case above.
+            relabel[left] = relabel[node]
+            for sub_node in bfs_from_hierarchy(hierarchy, right):
+                if sub_node < n_samples:
+                    result_list.append(
+                        (relabel[node], sub_node, lambda_value, 1)
+                    )
+                ignore[sub_node] = True
+
+    return np.array(result_list, dtype=CONDENSED_dtype)
+
+
+cdef dict _compute_stability(
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
+):
+    """Compute a stability score for every cluster of a condensed tree.
+
+    The birth value of a node is the `value` of the edge on which it appears
+    as a child; the cluster with the smallest id gets a birth value of 0. The
+    stability of a cluster is the sum, over all edges leaving it, of
+    `(edge value - birth of the cluster) * cluster_size of the edge`.
+
+    Parameters
+    ----------
+    condensed_tree : ndarray, dtype=CONDENSED_dtype
+        Edge list of the condensed tree.
+
+    Returns
+    -------
+    stability_dict : dict
+        Maps every id between the smallest and the largest parent id
+        (inclusive) to its stability.
+    """
+
+    cdef:
+        cnp.float64_t[::1] result, births
+        cnp.intp_t[:] parents = condensed_tree['parent']
+
+        cnp.intp_t parent, cluster_size, result_index, idx
+        cnp.float64_t lambda_val
+        CONDENSED_t condensed_node
+        cnp.intp_t largest_child = condensed_tree['child'].max()
+        cnp.intp_t smallest_cluster = np.min(parents)
+        cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
+        dict stability_dict = {}
+
+    # Make sure `births` can be indexed by the smallest cluster id as well.
+    largest_child = max(largest_child, smallest_cluster)
+    births = np.full(largest_child + 1, np.nan, dtype=np.float64)
+
+    # First pass: record the value at which each child appears.
+    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        condensed_node = condensed_tree[idx]
+        births[condensed_node.child] = condensed_node.value
+
+    # The cluster with the smallest id never appears as a child here.
+    births[smallest_cluster] = 0.0
+
+    # Second pass: accumulate the contribution of every edge to its parent.
+    result = np.zeros(num_clusters, dtype=np.float64)
+    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        condensed_node = condensed_tree[idx]
+        parent = condensed_node.parent
+        lambda_val = condensed_node.value
+        cluster_size = condensed_node.cluster_size
+
+        # `result` is indexed relative to the smallest cluster id.
+        result_index = parent - smallest_cluster
+        result[result_index] += (lambda_val - births[parent]) * cluster_size
+
+    for idx in range(num_clusters):
+        stability_dict[idx + smallest_cluster] = result[idx]
+
+    return stability_dict
+
+
+cdef list bfs_from_cluster_tree(
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
+    cnp.intp_t bfs_root
+):
+    """Return the ids reachable from `bfs_root` in a condensed tree.
+
+    The ids are collected level by level, starting with `bfs_root` itself:
+    the next level is made of every child whose parent is in the current one.
+    """
+
+    cdef:
+        cnp.ndarray[cnp.intp_t, ndim=1] child_ids = condensed_tree['child']
+        cnp.intp_t[:] parent_ids = condensed_tree['parent']
+        cnp.ndarray[cnp.intp_t, ndim=1] frontier = (
+            np.array([bfs_root], dtype=np.intp)
+        )
+        list visited = []
+
+    while frontier.shape[0] > 0:
+        visited.extend(frontier.tolist())
+        # Select the children of every node of the current level at once.
+        frontier = child_ids[np.isin(parent_ids, frontier)]
+
+    return visited
+
+
+cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):
+    """Return, for each parent id, the largest edge value among its edges.
+
+    The rows are scanned once and the maximum is tracked over consecutive
+    rows sharing the same parent; the stored value for a parent is the one
+    of its last run. Ids that never appear as a parent keep the value 0.
+    """
+
+    cdef:
+        cnp.intp_t row, row_parent, run_parent
+        cnp.intp_t n_rows
+        cnp.float64_t row_lambda, run_max
+        cnp.float64_t[::1] deaths
+        cnp.intp_t largest_parent = condensed_tree['parent'].max()
+
+    deaths = np.zeros(largest_parent + 1, dtype=np.float64)
+    n_rows = PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]
+
+    # Start the first run with the first row.
+    run_parent = condensed_tree[0].parent
+    run_max = condensed_tree[0].value
+
+    for row in range(1, n_rows):
+        row_parent = condensed_tree[row].parent
+        row_lambda = condensed_tree[row].value
+
+        if row_parent != run_parent:
+            # The run ended: store its maximum and start a new run.
+            deaths[run_parent] = run_max
+            run_parent = row_parent
+            run_max = row_lambda
+        else:
+            run_max = max(run_max, row_lambda)
+
+    # Store the maximum of the final run.
+    deaths[run_parent] = run_max
+    return deaths
+
+
+@cython.final
+cdef class TreeUnionFind:
+    """Union-find (disjoint-set) structure over the ids `0 .. size - 1`."""
+
+    # data[i, 0] is the parent of i (i itself for a root); data[i, 1] is the
+    # rank used to decide which root is kept when two sets are merged.
+    cdef cnp.intp_t[:, ::1] data
+    # Starts at 1 for every id; reset to 0 by `find` for any id that is
+    # found not to be its own parent.
+    cdef cnp.uint8_t[::1] is_component
+
+    def __init__(self, size):
+        # Every id starts as the root of its own singleton set, with rank 0.
+        forest = np.zeros((size, 2), dtype=np.intp)
+        forest[:, 0] = np.arange(size)
+        self.data = forest
+        self.is_component = np.ones(size, dtype=np.uint8)
+
+    cdef void union(self, cnp.intp_t x, cnp.intp_t y):
+        """Merge the sets containing `x` and `y` (union by rank)."""
+        cdef cnp.intp_t x_root = self.find(x)
+        cdef cnp.intp_t y_root = self.find(y)
+
+        # The root with the strictly smaller rank is attached to the other.
+        if self.data[x_root, 1] < self.data[y_root, 1]:
+            self.data[x_root, 0] = y_root
+            return
+
+        self.data[y_root, 0] = x_root
+        # On a tie, x_root is kept and its rank grows by one.
+        if self.data[x_root, 1] == self.data[y_root, 1]:
+            self.data[x_root, 1] += 1
+
+    cdef cnp.intp_t find(self, cnp.intp_t x):
+        """Return the root of the set containing `x`, compressing the path."""
+        if self.data[x, 0] != x:
+            self.data[x, 0] = self.find(self.data[x, 0])
+            self.is_component[x] = False
+        return self.data[x, 0]
+
+
+cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
+        const HIERARCHY_t[::1] linkage,
+        cnp.float64_t cut,
+        cnp.intp_t min_cluster_size
+):
+    """Given a single linkage tree and a cut value, return the
+    vector of cluster labels at that cut value. This is useful
+    for Robust Single Linkage, and extracting DBSCAN results
+    from a single HDBSCAN run.
+
+    Parameters
+    ----------
+    linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
+        The single linkage tree in scipy.cluster.hierarchy format.
+
+    cut : double
+        The cut value at which to find clusters. Only merges whose value is
+        strictly below `cut` are applied.
+
+    min_cluster_size : int
+        The minimum cluster size; clusters below this size at
+        the cut will be considered noise.
+
+    Returns
+    -------
+    labels : ndarray of shape (n_samples,)
+        The cluster labels for each point in the data set;
+        a label of -1 denotes a noise assignment.
+    """
+
+    cdef:
+        cnp.intp_t n, cluster, root, n_samples, cluster_label
+        cnp.intp_t[::1] unique_labels, cluster_size
+        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
+        TreeUnionFind union_find
+        dict cluster_label_map
+        HIERARCHY_t node
+
+    # Node ids cover the samples plus one id per merge; `root` is the last.
+    root = 2 * linkage.shape[0]
+    n_samples = root // 2 + 1
+    result = np.empty(n_samples, dtype=np.intp)
+    union_find = TreeUnionFind(root + 1)
+
+    # The k-th row of `linkage` creates node id n_samples + k.
+    cluster = n_samples
+    for node in linkage:
+        if node.value < cut:
+            union_find.union(node.left_node, cluster)
+            union_find.union(node.right_node, cluster)
+        cluster += 1
+
+    # Count the samples attached to each union-find representative.
+    cluster_size = np.zeros(cluster, dtype=np.intp)
+    for n in range(n_samples):
+        cluster = union_find.find(n)
+        cluster_size[cluster] += 1
+        result[n] = cluster
+
+    cluster_label_map = {-1: NOISE}
+    cluster_label = 0
+    unique_labels = np.unique(result)
+
+    # Representatives that are large enough get consecutive labels starting
+    # at 0, in the order returned by np.unique; the others become noise.
+    for cluster in unique_labels:
+        if cluster_size[cluster] < min_cluster_size:
+            cluster_label_map[cluster] = NOISE
+        else:
+            cluster_label_map[cluster] = cluster_label
+            cluster_label += 1
+
+    # Translate representatives into the final labels, in place.
+    for n in range(n_samples):
+        result[n] = cluster_label_map[result[n]]
+
+    return result
+
+
+cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
+        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
+        set clusters,
+        dict cluster_label_map,
+        cnp.intp_t allow_single_cluster,
+        cnp.float64_t cluster_selection_epsilon
+):
+    """Given a condensed tree, clusters and a labeling map for the clusters,
+    return an array containing the labels of each point based on cluster
+    membership. Note that this is where points may be marked as noisy
+    outliers. The determination of some points as noise in large,
+    single-cluster datasets is controlled by the `allow_single_cluster` and
+    `cluster_selection_epsilon` parameters.
+
+    Parameters
+    ----------
+    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
+        Effectively an edgelist encoding a parent/child pair, along with a
+        value and the corresponding cluster_size in each row providing a tree
+        structure.
+
+    clusters : set
+        The set of nodes corresponding to identified clusters. These node
+        values should be the same as those present in `condensed_tree`.
+
+    cluster_label_map : dict
+        A mapping from the node values present in `clusters` to the labels
+        which will be returned.
+
+    allow_single_cluster : int
+        When nonzero and exactly one cluster was selected, points that end up
+        attached to the root cluster may still receive the root's label,
+        provided their lambda value reaches the threshold described below.
+
+    cluster_selection_epsilon : float
+        When nonzero, the threshold used for points attached to the root
+        cluster is `1 / cluster_selection_epsilon`. When zero, the threshold
+        is the largest lambda value among the edges leaving the root cluster.
+
+    Returns
+    -------
+    labels : ndarray of shape (n_samples,)
+        The cluster labels for each point in the data set;
+        a label of -1 denotes a noise assignment.
+    """
+
+    cdef:
+        cnp.intp_t root_cluster
+        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
+        cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
+        cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
+        TreeUnionFind union_find
+        cnp.intp_t n, parent, child, cluster
+        cnp.float64_t threshold
+
+    child_array = condensed_tree['child']
+    parent_array = condensed_tree['parent']
+    lambda_array = condensed_tree['value']
+
+    # Ids below the smallest parent id are treated as individual points, so
+    # the smallest parent id is also the length of the result.
+    root_cluster = np.min(parent_array)
+    result = np.empty(root_cluster, dtype=np.intp)
+    union_find = TreeUnionFind(np.max(parent_array) + 1)
+
+    # Merge every child into its parent unless the child is itself a selected
+    # cluster; selected clusters therefore stay separate components.
+    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        child = child_array[n]
+        parent = parent_array[n]
+        if child not in clusters:
+            union_find.union(parent, child)
+
+    for n in range(root_cluster):
+        cluster = union_find.find(n)
+        label = NOISE
+        if cluster != root_cluster:
+            label = cluster_label_map[cluster]
+        elif len(clusters) == 1 and allow_single_cluster:
+            # There can only be one edge with this particular child hence this
+            # expression extracts a unique, scalar lambda value.
+            parent_lambda = lambda_array[child_array == n]
+            if cluster_selection_epsilon != 0.0:
+                threshold = 1 / cluster_selection_epsilon
+            else:
+                # The threshold should be calculated per-sample based on the
+                # largest lambda of any sibling node.
+                threshold = lambda_array[parent_array == cluster].max()
+            if parent_lambda >= threshold:
+                label = cluster_label_map[cluster]
+
+        result[n] = label
+
+    return result
+
+
+cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities(
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
+    dict cluster_map,
+    cnp.intp_t[::1] labels
+):
+    """Compute a membership strength for every point that has a cluster label.
+
+    For a labelled point, the strength is the lambda value of its edge,
+    capped at the value `max_lambdas` reports for the point's cluster, and
+    divided by that value. It is 1 when that value is zero or when the
+    point's lambda is infinite. Points labelled -1 keep a strength of 0.
+
+    Parameters
+    ----------
+    condensed_tree : ndarray, dtype=CONDENSED_dtype
+        Edge list of the condensed tree.
+
+    cluster_map : dict
+        Maps a cluster label (as found in `labels`) to the id of the
+        corresponding cluster in `condensed_tree`.
+
+    labels : ndarray of shape (n_samples,)
+        Cluster label of every point; -1 denotes noise.
+
+    Returns
+    -------
+    result : ndarray of shape (n_samples,)
+        Membership strength of every point.
+    """
+
+    cdef:
+        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result
+        cnp.float64_t[:] lambda_array
+        cnp.float64_t[::1] deaths
+        cnp.intp_t[:] child_array, parent_array
+        cnp.intp_t root_cluster, n, n_rows, point, cluster_num, cluster
+        cnp.float64_t death_lambda, point_lambda
+
+    child_array = condensed_tree['child']
+    parent_array = condensed_tree['parent']
+    lambda_array = condensed_tree['value']
+
+    result = np.zeros(labels.shape[0])
+    deaths = max_lambdas(condensed_tree)
+    root_cluster = np.min(parent_array)
+    n_rows = PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]
+
+    for n in range(n_rows):
+        point = child_array[n]
+        # Children at or above the root id are clusters, not points.
+        if point >= root_cluster:
+            continue
+
+        cluster_num = labels[point]
+        # Noise points keep their initial strength of zero.
+        if cluster_num == -1:
+            continue
+
+        cluster = cluster_map[cluster_num]
+        death_lambda = deaths[cluster]
+        point_lambda = lambda_array[n]
+
+        if death_lambda != 0.0 and not isinf(point_lambda):
+            result[point] = min(point_lambda, death_lambda) / death_lambda
+        else:
+            result[point] = 1.0
+
+    return result
+
+
+cpdef list recurse_leaf_dfs(
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
+    cnp.intp_t current_node
+):
+    """Collect the leaves of the sub-tree rooted at `current_node`.
+
+    Parameters
+    ----------
+    cluster_tree : ndarray, dtype=CONDENSED_dtype
+        Edge list of the cluster tree.
+
+    current_node : int
+        Id of the node the depth-first search starts from.
+
+    Returns
+    -------
+    leaves : list of int
+        Ids of the nodes below `current_node` that never appear as a parent,
+        in depth-first order. If `current_node` has no children, the list
+        contains `current_node` only.
+    """
+    cdef cnp.intp_t[:] children
+    cdef cnp.intp_t child
+    cdef list leaves
+
+    children = cluster_tree[cluster_tree['parent'] == current_node]['child']
+    if children.shape[0] == 0:
+        return [current_node,]
+
+    # Grow a single list in place. Concatenating with `sum(lists, [])` builds
+    # a fresh, ever longer list for every child, which is quadratic in the
+    # number of leaves.
+    leaves = []
+    for child in children:
+        leaves.extend(recurse_leaf_dfs(cluster_tree, child))
+    return leaves
+
+
+cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree):
+    """Return the leaf ids of `cluster_tree`, or an empty list if it has no rows.
+
+    The search starts from the smallest parent id found in the tree.
+    """
+    cdef cnp.intp_t root
+    if PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] != 0:
+        root = cluster_tree['parent'].min()
+        return recurse_leaf_dfs(cluster_tree, root)
+    return []
+
+cdef cnp.intp_t traverse_upwards(
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
+    cnp.float64_t cluster_selection_epsilon,
+    cnp.intp_t leaf,
+    cnp.intp_t allow_single_cluster
+):
+    """Climb from `leaf` to the first ancestor split above the epsilon threshold.
+
+    Parameters
+    ----------
+    cluster_tree : ndarray, dtype=CONDENSED_dtype
+        Edge list of the cluster tree. Each child id is expected to appear
+        on exactly one edge.
+
+    cluster_selection_epsilon : float
+        Distance threshold: the climb stops at the first parent whose own
+        edge corresponds to a distance (`1 / value`) strictly greater than
+        this threshold.
+
+    leaf : int
+        Id of the node to start climbing from.
+
+    allow_single_cluster : int
+        Decides what is returned when the parent of `leaf` is the root: the
+        root itself when nonzero, `leaf` otherwise.
+
+    Returns
+    -------
+    node : int
+        Id of the selected node.
+    """
+    cdef cnp.intp_t root, parent
+    cdef cnp.float64_t parent_eps
+
+    root = cluster_tree['parent'].min()
+    # Boolean-mask selection yields a 1-d array even for a single match.
+    # Take element 0 explicitly instead of relying on the implicit conversion
+    # of a size-1 array to a scalar, which NumPy has deprecated.
+    parent = cluster_tree[cluster_tree['child'] == leaf]['parent'][0]
+    if parent == root:
+        if allow_single_cluster:
+            return parent
+        else:
+            return leaf  # return node closest to root
+
+    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'][0]
+    if parent_eps > cluster_selection_epsilon:
+        return parent
+    else:
+        return traverse_upwards(
+            cluster_tree,
+            cluster_selection_epsilon,
+            parent,
+            allow_single_cluster
+        )
+
+cdef set epsilon_search(
+    set leaves,
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
+    cnp.float64_t cluster_selection_epsilon,
+    cnp.intp_t allow_single_cluster
+):
+    """Select clusters by replacing leaves born below the epsilon threshold.
+
+    A leaf whose distance (`1 / value` of its edge) is below
+    `cluster_selection_epsilon` is replaced by the node returned by
+    `traverse_upwards`; every other leaf is kept as is.
+
+    Parameters
+    ----------
+    leaves : set of int
+        Ids of the candidate leaf clusters.
+
+    cluster_tree : ndarray, dtype=CONDENSED_dtype
+        Edge list of the cluster tree.
+
+    cluster_selection_epsilon : float
+        Distance threshold for cluster splits.
+
+    allow_single_cluster : int
+        Forwarded to `traverse_upwards`.
+
+    Returns
+    -------
+    selected_clusters : set of int
+        Ids of the selected clusters.
+    """
+    cdef:
+        list selected_clusters = list()
+        # Only used for membership tests, hence a set rather than a list.
+        set processed = set()
+        cnp.intp_t leaf, epsilon_child, sub_node
+        cnp.float64_t eps
+        cnp.uint8_t[:] leaf_nodes
+        cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child']
+        cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value']
+
+    for leaf in leaves:
+        leaf_nodes = children == leaf
+        eps = 1 / distances[leaf_nodes][0]
+        if eps < cluster_selection_epsilon:
+            # Leaves already covered by a previously selected ancestor are
+            # dropped.
+            if leaf not in processed:
+                epsilon_child = traverse_upwards(
+                    cluster_tree,
+                    cluster_selection_epsilon,
+                    leaf,
+                    allow_single_cluster
+                )
+                selected_clusters.append(epsilon_child)
+
+                # Everything below the selected node is now accounted for.
+                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
+                    if sub_node != epsilon_child:
+                        processed.add(sub_node)
+        else:
+            selected_clusters.append(leaf)
+
+    return set(selected_clusters)
+
+
+@cython.wraparound(True)
+cdef tuple _get_clusters(
+    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
+    dict stability,
+    cluster_selection_method='eom',
+    cnp.uint8_t allow_single_cluster=False,
+    cnp.float64_t cluster_selection_epsilon=0.0,
+    max_cluster_size=None
+):
+    """Given a tree and stability dict, produce the cluster labels
+    (and probabilities) for a flat clustering based on the chosen
+    cluster selection method.
+
+    Note that `stability` is updated in place when the 'eom' method is used.
+
+    Parameters
+    ----------
+    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
+        Effectively an edgelist encoding a parent/child pair, along with a
+        value and the corresponding cluster_size in each row providing a tree
+        structure.
+
+    stability : dict
+        A dictionary mapping cluster_ids to stability values
+
+    cluster_selection_method : string, optional (default 'eom')
+        The method of selecting clusters. The default is the
+        Excess of Mass algorithm specified by 'eom'. The alternate
+        option is 'leaf'.
+
+    allow_single_cluster : boolean, optional (default False)
+        Whether to allow a single cluster to be selected by the
+        Excess of Mass algorithm.
+
+    cluster_selection_epsilon: double, optional (default 0.0)
+        A distance threshold for cluster splits.
+
+    max_cluster_size: int, default=None
+        The maximum size for clusters located by the EOM clusterer. Can
+        be overridden by the cluster_selection_epsilon parameter in
+        rare cases.
+
+    Returns
+    -------
+    labels : ndarray of shape (n_samples,)
+        An integer array of cluster labels, with -1 denoting noise.
+
+    probabilities : ndarray (n_samples,)
+        The cluster membership strength of each sample.
+    """
+    cdef:
+        list node_list
+        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree
+        cnp.uint8_t[::1] child_selection
+        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
+        dict is_cluster, cluster_sizes
+        cnp.float64_t subtree_stability
+        cnp.intp_t node, sub_node, cluster, n_samples
+        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs
+
+    # Assume clusters are ordered by numeric id equivalent to
+    # a topological sort of the tree; This is valid given the
+    # current implementation above, so don't change that ... or
+    # if you do, change this accordingly!
+    if allow_single_cluster:
+        node_list = sorted(stability.keys(), reverse=True)
+    else:
+        node_list = sorted(stability.keys(), reverse=True)[:-1]
+        # (exclude root)
+
+    # Keep only the rows whose child is itself a cluster (size > 1).
+    cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1]
+    is_cluster = {cluster: True for cluster in node_list}
+    # Rows of size 1 have sample ids as children; the largest id + 1 is used
+    # as the number of samples.
+    n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1
+
+    if max_cluster_size is None:
+        max_cluster_size = n_samples + 1  # Set to a value that will never be triggered
+    cluster_sizes = {
+        child: cluster_size for child, cluster_size
+        in zip(cluster_tree['child'], cluster_tree['cluster_size'])
+    }
+    if allow_single_cluster:
+        # Compute cluster size for the root node
+        cluster_sizes[node_list[-1]] = np.sum(
+            cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size'])
+
+    if cluster_selection_method == 'eom':
+        # Nodes are visited from the largest id to the smallest, so children
+        # are settled before their parent.
+        for node in node_list:
+            child_selection = (cluster_tree['parent'] == node)
+            subtree_stability = np.sum([
+                stability[child] for
+                child in cluster_tree['child'][child_selection]])
+            if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
+                # The children win: deselect this node and propagate their
+                # combined stability upwards.
+                is_cluster[node] = False
+                stability[node] = subtree_stability
+            else:
+                # This node wins: deselect every descendant.
+                for sub_node in bfs_from_cluster_tree(cluster_tree, node):
+                    if sub_node != node:
+                        is_cluster[sub_node] = False
+
+        if cluster_selection_epsilon != 0.0 and PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
+            eom_clusters = [c for c in is_cluster if is_cluster[c]]
+            selected_clusters = []
+            # first check if eom_clusters only has root node, which skips epsilon check.
+            if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()):
+                if allow_single_cluster:
+                    selected_clusters = eom_clusters
+            else:
+                selected_clusters = epsilon_search(
+                    set(eom_clusters),
+                    cluster_tree,
+                    cluster_selection_epsilon,
+                    allow_single_cluster
+                )
+            for c in is_cluster:
+                if c in selected_clusters:
+                    is_cluster[c] = True
+                else:
+                    is_cluster[c] = False
+
+    elif cluster_selection_method == 'leaf':
+        leaves = set(get_cluster_tree_leaves(cluster_tree))
+        if len(leaves) == 0:
+            for c in is_cluster:
+                is_cluster[c] = False
+            is_cluster[condensed_tree['parent'].min()] = True
+            # NOTE(review): with no leaves, `selected_clusters` below is
+            # empty, so the loop at the end of this branch resets the root
+            # to False again -- confirm whether that is intended.
+
+        if cluster_selection_epsilon != 0.0:
+            selected_clusters = epsilon_search(
+                leaves,
+                cluster_tree,
+                cluster_selection_epsilon,
+                allow_single_cluster
+            )
+        else:
+            selected_clusters = leaves
+
+        for c in is_cluster:
+            if c in selected_clusters:
+                is_cluster[c] = True
+            else:
+                is_cluster[c] = False
+
+    # Map the selected cluster ids to consecutive labels (in increasing id
+    # order) and build the inverse mapping.
+    clusters = set([c for c in is_cluster if is_cluster[c]])
+    cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
+    reverse_cluster_map = {n: c for c, n in cluster_map.items()}
+
+    labels = _do_labelling(
+        condensed_tree,
+        clusters,
+        cluster_map,
+        allow_single_cluster,
+        cluster_selection_epsilon
+    )
+    probs = get_probabilities(condensed_tree, reverse_cluster_map, labels)
+
+    return (labels, probs)
@@ -0,0 +1,16 @@
+# Extension modules of the HDBSCAN sub-package, keyed by module name; each
+# entry lists the sources passed to `py.extension_module` below.
+# NOTE(review): `metrics_cython_tree` is not defined in this file; presumably
+# it comes from the metrics build definition -- confirm.
+cluster_hdbscan_extension_metadata = {
+  '_linkage': {'sources': ['_linkage.pyx', metrics_cython_tree]},
+  '_reachability': {'sources': ['_reachability.pyx']},
+  '_tree': {'sources': ['_tree.pyx']}
+}
+
+# Build and install one extension module per entry; all of them share the
+# same dependencies, Cython arguments and install sub-directory.
+foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata
+  py.extension_module(
+    ext_name,
+    ext_dict.get('sources'),
+    dependencies: [np_dep],
+    cython_args: cython_args,
+    subdir: 'sklearn/cluster/_hdbscan',
+    install: true
+  )
+endforeach
@@ -0,0 +1,63 @@
+import numpy as np
+import pytest
+
+from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
+from sklearn.utils._testing import (
+    _convert_container,
+    assert_allclose,
+)
+
+
+def test_mutual_reachability_graph_error_sparse_format():
+    """Check that we raise an error if the sparse format is not CSR."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 10)
+    X = X.T @ X
+    np.fill_diagonal(X, 0.0)
+    X = _convert_container(X, "sparse_csc")
+
+    err_msg = "Only sparse CSR matrices are supported"
+    with pytest.raises(ValueError, match=err_msg):
+        mutual_reachability_graph(X)
+
+
+@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
+def test_mutual_reachability_graph_inplace(array_type):
+    """Check that the operation is happening inplace."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 10)
+    X = X.T @ X
+    np.fill_diagonal(X, 0.0)
+    X = _convert_container(X, array_type)
+
+    mr_graph = mutual_reachability_graph(X)
+
+    assert id(mr_graph) == id(X)
+
+
+def test_mutual_reachability_graph_equivalence_dense_sparse():
+    """Check that we get the same results for dense and sparse implementation."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(5, 5)
+    X_dense = X.T @ X
+    X_sparse = _convert_container(X_dense, "sparse_csr")
+
+    mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3)
+    mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3)
+
+    assert_allclose(mr_graph_dense, mr_graph_sparse.toarray())
+
+
+@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
+    """Check that the computation preserve dtype thanks to fused types."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 10)
+    X = (X.T @ X).astype(dtype)
+    np.fill_diagonal(X, 0.0)
+    X = _convert_container(X, array_type)
+
+    assert X.dtype == dtype
+    mr_graph = mutual_reachability_graph(X)
+    assert mr_graph.dtype == dtype
@@ -0,0 +1,9 @@
+from ..utils._typedefs cimport intp_t
+
+# C-level declaration of the UnionFind extension type so that other Cython
+# modules can cimport it and call its methods directly.
+cdef class UnionFind:
+    # Label that will be given to the next merged set.
+    cdef intp_t next_label
+    # parent[i] is the label of the set that i was merged into.
+    cdef intp_t[:] parent
+    # size[i] is the size recorded for the set labelled i.
+    cdef intp_t[:] size
+
+    # Merge the sets labelled m and n under a fresh label.
+    cdef void union(self, intp_t m, intp_t n) noexcept
+    # Return the label of the top-most set containing n.
+    cdef intp_t fast_find(self, intp_t n) noexcept
@@ -0,0 +1,506 @@
+# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
+
+import numpy as np
+cimport cython
+
+from ..metrics._dist_metrics cimport DistanceMetric64
+from ..utils._fast_dict cimport IntFloatDict
+from ..utils._typedefs cimport float64_t, intp_t, uint8_t
+
+# C++
+from cython.operator cimport dereference as deref, preincrement as inc
+from libcpp.map cimport map as cpp_map
+from libc.math cimport fmax, INFINITY
+
+
+###############################################################################
+# Utilities for computing the ward momentum
+
+def compute_ward_dist(
+    const float64_t[::1] m_1,
+    const float64_t[:, ::1] m_2,
+    const intp_t[::1] coord_row,
+    const intp_t[::1] coord_col,
+    float64_t[::1] res
+):
+    """Fill `res` with one distance per (row, col) pair of coordinates.
+
+    For the i-th pair, the rows `m_2[row]` and `m_2[col]` are each divided by
+    `m_1[row]` and `m_1[col]` respectively, the squared differences are
+    summed over the features, and the sum is scaled by
+    `m_1[row] * m_1[col] / (m_1[row] + m_1[col])`. The result is written to
+    `res[i]`; nothing is returned.
+    """
+    cdef:
+        intp_t n_pairs = coord_row.shape[0]
+        intp_t n_features = m_2.shape[1]
+        intp_t pair, feature, row, col
+        float64_t sq_dist, scale
+
+    for pair in range(n_pairs):
+        row = coord_row[pair]
+        col = coord_col[pair]
+        scale = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])
+        sq_dist = 0.
+        for feature in range(n_features):
+            sq_dist += (
+                m_2[row, feature] / m_1[row] - m_2[col, feature] / m_1[col]
+            ) ** 2
+        res[pair] = sq_dist * scale
+
+
+###############################################################################
+# Utilities for cutting and exploring a hierarchical tree
+
+def _hc_get_descendent(intp_t node, children, intp_t n_leaves):
+    """
+    Collect all the leaves found below a node of the tree.
+
+    Parameters
+    ----------
+    node : integer
+        The node for which we want the descendents.
+
+    children : list of pairs, length n_nodes
+        The children of each non-leaf node. Values less than `n_leaves` refer
+        to leaves of the tree. A greater value `i` indicates a node with
+        children `children[i - n_leaves]`.
+
+    n_leaves : integer
+        Number of leaves.
+
+    Returns
+    -------
+    descendent : list of int
+        The leaves below `node`, or `[node]` when `node` is itself a leaf.
+    """
+    # The number of pending nodes is tracked by hand rather than with len().
+    cdef intp_t current, n_pending = 1
+
+    pending = [node]
+    if node < n_leaves:
+        return pending
+
+    leaves = []
+    while n_pending:
+        current = pending.pop()
+        if current >= n_leaves:
+            # Internal node: replace it by its children.
+            pending.extend(children[current - n_leaves])
+            n_pending += 1
+        else:
+            leaves.append(current)
+            n_pending -= 1
+    return leaves
+
+
+def hc_get_heads(intp_t[:] parents, copy=True):
+    """Return, for every node, the head of the tree it belongs to.
+
+    Parameters
+    ----------
+    parents : array of integers
+        The parent structure defining the forest (ensemble of trees). A node
+        that is its own parent is a head.
+    copy : boolean
+        If copy is False, the input 'parents' array is modified inplace
+
+    Returns
+    -------
+    heads : array of integers of same shape as parents
+        The indices in the 'parents' of the tree heads
+
+    """
+    cdef intp_t start, current, ancestor, n_nodes
+
+    if copy:
+        parents = np.copy(parents)
+    n_nodes = parents.size
+
+    # Visit the nodes from the last one down to the first one.
+    for start in range(n_nodes - 1, -1, -1):
+        current = start
+        ancestor = parents[current]
+        while ancestor != current:
+            # Point the starting node at the ancestor reached so far.
+            parents[start] = ancestor
+            current = ancestor
+            ancestor = parents[current]
+    return parents
+
+
+def _get_parents(
+    nodes,
+    heads,
+    const intp_t[:] parents,
+    uint8_t[::1] not_visited
+):
+    """Append to 'heads' the heads of the given nodes, as defined by parents.
+
+    Modifies 'heads' and 'not_visited' in-place; each head is appended at
+    most once, the first time it is reached.
+
+    Parameters
+    ----------
+    nodes : list of integers
+        The nodes to start from
+    heads : list of integers
+        A list to hold the results (modified inplace)
+    parents : array of integers
+        The parent structure defining the tree
+    not_visited
+        The tree nodes to consider (modified inplace)
+
+    """
+    cdef intp_t current, ancestor
+
+    for current in nodes:
+        # Walk up until reaching a node that is its own parent.
+        ancestor = parents[current]
+        while ancestor != current:
+            current = ancestor
+            ancestor = parents[current]
+        if not not_visited[current]:
+            continue
+        not_visited[current] = 0
+        heads.append(current)
+
+
+###############################################################################
+# merge strategies implemented on IntFloatDicts
+
+# These are used in the hierarchical clustering code, to implement
+# merging between two clusters, defined as a dict containing node number
+# as keys and edge weights as values.
+
+
+def max_merge(
+    IntFloatDict a,
+    IntFloatDict b,
+    const intp_t[:] mask,
+    intp_t n_a,
+    intp_t n_b
+):
+    """Merge two IntFloatDicts with the max strategy: when the same key is
+    present in the two dicts, the max of the two values is used.
+
+    Parameters
+    ==========
+    a, b : IntFloatDict object
+        The IntFloatDicts to merge
+    mask : ndarray array of dtype integer and of dimension 1
+        a mask for keys to ignore: if not mask[key] the corresponding key
+        is skipped in the output dictionary
+    n_a, n_b : integer
+        n_a and n_b are weights for a and b for the merge strategy.
+        They are not used in the case of a max merge.
+
+    Returns
+    =======
+    out : IntFloatDict object
+        The IntFloatDict resulting from the merge
+    """
+    cdef:
+        IntFloatDict merged = IntFloatDict.__new__(IntFloatDict)
+        cpp_map[intp_t, float64_t].iterator it_a = a.my_map.begin()
+        cpp_map[intp_t, float64_t].iterator end_a = a.my_map.end()
+        cpp_map[intp_t, float64_t].iterator it_b = b.my_map.begin()
+        cpp_map[intp_t, float64_t].iterator end_b = b.my_map.end()
+        cpp_map[intp_t, float64_t].iterator found
+        cpp_map[intp_t, float64_t].iterator end_merged
+        intp_t key
+        float64_t value
+
+    # Step 1: copy the unmasked entries of a.
+    while it_a != end_a:
+        key = deref(it_a).first
+        if mask[key]:
+            merged.my_map[key] = deref(it_a).second
+        inc(it_a)
+
+    # Step 2: fold in the unmasked entries of b.
+    end_merged = merged.my_map.end()
+    while it_b != end_b:
+        key = deref(it_b).first
+        value = deref(it_b).second
+        if mask[key]:
+            found = merged.my_map.find(key)
+            if found != end_merged:
+                # Key present in both: keep the larger value.
+                deref(found).second = fmax(deref(found).second, value)
+            else:
+                merged.my_map[key] = value
+        inc(it_b)
+    return merged
+
+
+def average_merge(
+    IntFloatDict a,
+    IntFloatDict b,
+    const intp_t[:] mask,
+    intp_t n_a,
+    intp_t n_b
+):
+    """Merge two IntFloatDicts with the average strategy: when the
+    same key is present in the two dicts, the weighted average of the two
+    values is used.
+
+    Parameters
+    ==========
+    a, b : IntFloatDict object
+        The IntFloatDicts to merge
+    mask : ndarray array of dtype integer and of dimension 1
+        a mask for keys to ignore: if not mask[key] the corresponding key
+        is skipped in the output dictionary
+    n_a, n_b : integer
+        n_a and n_b are weights for a and b for the merge strategy.
+        They are used for a weighted mean.
+
+    Returns
+    =======
+    out : IntFloatDict object
+        The IntFloatDict resulting from the merge
+    """
+    cdef:
+        IntFloatDict merged = IntFloatDict.__new__(IntFloatDict)
+        cpp_map[intp_t, float64_t].iterator it_a = a.my_map.begin()
+        cpp_map[intp_t, float64_t].iterator end_a = a.my_map.end()
+        cpp_map[intp_t, float64_t].iterator it_b = b.my_map.begin()
+        cpp_map[intp_t, float64_t].iterator end_b = b.my_map.end()
+        cpp_map[intp_t, float64_t].iterator found
+        cpp_map[intp_t, float64_t].iterator end_merged
+        intp_t key
+        float64_t value
+        float64_t n_out = <float64_t> (n_a + n_b)
+
+    # Step 1: copy the unmasked entries of a.
+    while it_a != end_a:
+        key = deref(it_a).first
+        if mask[key]:
+            merged.my_map[key] = deref(it_a).second
+        inc(it_a)
+
+    # Step 2: fold in the unmasked entries of b.
+    end_merged = merged.my_map.end()
+    while it_b != end_b:
+        key = deref(it_b).first
+        value = deref(it_b).second
+        if mask[key]:
+            found = merged.my_map.find(key)
+            if found != end_merged:
+                # Key present in both: weighted mean of the two values.
+                deref(found).second = (n_a * deref(found).second
+                                       + n_b * value) / n_out
+            else:
+                merged.my_map[key] = value
+        inc(it_b)
+    return merged
+
+
+###############################################################################
+# An edge object for fast comparisons
+
+cdef class WeightedEdge:
+    """An edge between nodes `a` and `b`, compared by `weight` only."""
+    cdef public intp_t a
+    cdef public intp_t b
+    cdef public float64_t weight
+
+    def __init__(self, float64_t weight, intp_t a, intp_t b):
+        self.a = a
+        self.b = b
+        self.weight = weight
+
+    def __richcmp__(self, WeightedEdge other, int op):
+        """Compare the weights of two edges.
+
+        op is the comparison code::
+            <   0
+            ==  2
+            >   4
+            <=  1
+            !=  3
+            >=  5
+        """
+        cdef float64_t mine = self.weight
+        cdef float64_t theirs = other.weight
+
+        if op == 0:
+            return mine < theirs
+        if op == 1:
+            return mine <= theirs
+        if op == 2:
+            return mine == theirs
+        if op == 3:
+            return mine != theirs
+        if op == 4:
+            return mine > theirs
+        if op == 5:
+            return mine >= theirs
+
+    def __repr__(self):
+        fields = (self.__class__.__name__, self.weight, self.a, self.b)
+        return "%s(weight=%f, a=%i, b=%i)" % fields
+
+
+################################################################################
+# Efficient labelling/conversion of MSTs to single linkage hierarchies
+
+cdef class UnionFind(object):
+    """Union-find structure used to label the merges of a linkage tree.
+
+    For `N` initial singleton sets labelled `0..N-1`, room is reserved for
+    `2 * N - 1` labels in total: every call to `union` creates a new label,
+    starting at `N`. `parent[i] == -1` marks a label that has not been merged
+    into anything yet.
+    """
+
+    def __init__(self, N):
+        self.parent = np.full(2 * N - 1, -1, dtype=np.intp, order='C')
+        self.next_label = N
+        # Singletons have size 1; merged sets get their size in `union`.
+        self.size = np.hstack((np.ones(N, dtype=np.intp),
+                               np.zeros(N - 1, dtype=np.intp)))
+
+    cdef void union(self, intp_t m, intp_t n) noexcept:
+        """Merge the sets labelled `m` and `n` under a fresh label."""
+        self.parent[m] = self.next_label
+        self.parent[n] = self.next_label
+        self.size[self.next_label] = self.size[m] + self.size[n]
+        self.next_label += 1
+        return
+
+    @cython.wraparound(True)
+    cdef intp_t fast_find(self, intp_t n) noexcept:
+        """Return the top-most label above `n`, compressing the path to it."""
+        cdef intp_t p, next_p
+        p = n
+        # find the highest node in the linkage graph so far
+        while self.parent[n] != -1:
+            n = self.parent[n]
+        # provide a shortcut up to the highest node for every node on the
+        # path. The next node is read *before* the link is overwritten, and
+        # the loop stops at the root, so neither the root's own entry nor
+        # `parent[-1]` is ever written.
+        while p != n:
+            next_p = self.parent[p]
+            self.parent[p] = n
+            p = next_p
+        return n
+
+
+def _single_linkage_label(const float64_t[:, :] L):
+    """
+    Convert a linkage array or MST to a tree by labelling clusters at merges.
+    This is done by using a Union find structure to keep track of merges
+    efficiently. This is the private version of the function that assumes that
+    ``L`` has been properly validated. See ``single_linkage_label`` for the
+    user facing version of this function.
+
+    Parameters
+    ----------
+    L: array of shape (n_samples - 1, 3)
+        The linkage array or MST where each row specifies two samples
+        to be merged and a distance or weight at which the merge occurs. This
+        array is assumed to be sorted by the distance/weight.
+
+    Returns
+    -------
+    A tree in the format used by scipy.cluster.hierarchy.
+    """
+    cdef intp_t n_merges = L.shape[0]
+    cdef intp_t step, node_a, node_b, root_a, root_b
+    cdef float64_t merge_distance
+    cdef float64_t[:, ::1] linkage = np.zeros((n_merges, 4), dtype=np.float64)
+    cdef UnionFind forest = UnionFind(n_merges + 1)
+
+    for step in range(n_merges):
+        node_a = <intp_t> L[step, 0]
+        node_b = <intp_t> L[step, 1]
+        merge_distance = L[step, 2]
+
+        # Current cluster labels of the two endpoints.
+        root_a = forest.fast_find(node_a)
+        root_b = forest.fast_find(node_b)
+
+        linkage[step, 0] = root_a
+        linkage[step, 1] = root_b
+        linkage[step, 2] = merge_distance
+        linkage[step, 3] = forest.size[root_a] + forest.size[root_b]
+
+        forest.union(root_a, root_b)
+
+    return np.asarray(linkage)
+
+
+@cython.wraparound(True)
+def single_linkage_label(L):
+    """
+    Convert a linkage array or MST to a tree by labelling clusters at merges.
+    This is done by using a Union find structure to keep track of merges
+    efficiently.
+
+    Parameters
+    ----------
+    L: array of shape (n_samples - 1, 3)
+        The linkage array or MST where each row specifies two samples
+        to be merged and a distance or weight at which the merge occurs. This
+        array is assumed to be sorted by the distance/weight.
+
+    Returns
+    -------
+    A tree in the format used by scipy.cluster.hierarchy.
+
+    Raises
+    ------
+    ValueError
+        If a node index is out of range or the rows are not sorted by weight.
+    """
+    # Node indices live in the first two columns.
+    endpoints = L[:, :2]
+    if endpoints.min() < 0 or endpoints.max() >= 2 * L.shape[0] + 1:
+        raise ValueError("Input MST array is not a validly formatted MST array")
+
+    # The third column must be non-decreasing.
+    weights = L[:, 2]
+    if not np.all(weights[:-1] <= weights[1:]):
+        raise ValueError("Input MST array must be sorted by weight")
+
+    return _single_linkage_label(L)
+
+
+# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
+def mst_linkage_core(
+        const float64_t [:, ::1] raw_data,
+        DistanceMetric64 dist_metric):
+    """
+    Compute the necessary elements of a minimum spanning
+    tree for computation of single linkage clustering. This
+    represents the MST-LINKAGE-CORE algorithm (Figure 6) from
+    :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering
+    algorithms" <1109.2378>`.
+
+    In contrast to the scipy implementation it never computes
+    a full distance matrix, generating distances only as they
+    are needed and releasing them when no longer needed.
+
+    Parameters
+    ----------
+    raw_data: array of shape (n_samples, n_features)
+        The array of feature data to be clustered. Must be C-aligned
+
+    dist_metric: DistanceMetric64
+        A DistanceMetric64 object conforming to the API from
+        ``sklearn.metrics._dist_metrics.pxd`` that will be
+        used to compute distances.
+
+    Returns
+    -------
+    mst_core_data: array of shape (n_samples - 1, 3)
+        An array providing information from which one
+        can either compute an MST, or the linkage hierarchy
+        very efficiently. Each row holds the current node, the node
+        added next and the distance at which it is added. See
+        :arxiv:`Daniel Mullner, "Modern hierarchical,
+        agglomerative clustering algorithms" <1109.2378>` algorithm
+        MST-LINKAGE-CORE for more details.
+    """
+    cdef:
+        intp_t n_samples = raw_data.shape[0]
+        intp_t num_features = raw_data.shape[1]
+        uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool)
+        float64_t[:, ::1] result = np.zeros((n_samples - 1, 3))
+        # Smallest distance seen so far from the tree to each sample.
+        float64_t[:] current_distances = np.full(n_samples, INFINITY)
+
+        intp_t current_node = 0
+        intp_t new_node
+        intp_t step
+        intp_t candidate
+
+        float64_t known_distance
+        float64_t fresh_distance
+        float64_t new_distance
+
+    for step in range(n_samples - 1):
+
+        in_tree[current_node] = 1
+
+        new_distance = INFINITY
+        new_node = 0
+
+        for candidate in range(n_samples):
+            if in_tree[candidate]:
+                continue
+
+            known_distance = current_distances[candidate]
+            fresh_distance = dist_metric.dist(&raw_data[current_node, 0],
+                                              &raw_data[candidate, 0],
+                                              num_features)
+
+            # Keep the smaller of the stored and the freshly computed distance.
+            if fresh_distance < known_distance:
+                known_distance = fresh_distance
+                current_distances[candidate] = fresh_distance
+
+            # Track the closest sample that is not in the tree yet.
+            if known_distance < new_distance:
+                new_distance = known_distance
+                new_node = candidate
+
+        result[step, 0] = current_node
+        result[step, 1] = new_node
+        result[step, 2] = new_distance
+        current_node = new_node
+
+    return np.array(result)
@@ -0,0 +1,48 @@
+from cython cimport floating
+
+
+# C-level declarations shared with other Cython modules; `floating` is
+# Cython's fused float/double type.
+
+# Euclidean distance between two dense vectors given as raw pointers. The
+# implementation names the arguments (a, b, n_features, squared).
+cdef floating _euclidean_dense_dense(
+    const floating*,
+    const floating*,
+    int,
+    bint
+) noexcept nogil
+
+# Euclidean distance between a sparse vector and a dense one. The
+# implementation names the arguments
+# (a_data, a_indices, b, b_squared_norm, squared).
+cdef floating _euclidean_sparse_dense(
+    const floating[::1],
+    const int[::1],
+    const floating[::1],
+    floating,
+    bint
+) noexcept nogil
+
+# NOTE(review): the implementations of the four declarations below are not
+# visible from here; argument meanings must be checked against the .pyx.
+cpdef void _relocate_empty_clusters_dense(
+    const floating[:, ::1],
+    const floating[::1],
+    const floating[:, ::1],
+    floating[:, ::1],
+    floating[::1],
+    const int[::1]
+)
+
+cpdef void _relocate_empty_clusters_sparse(
+    const floating[::1],
+    const int[::1],
+    const int[::1],
+    const floating[::1],
+    const floating[:, ::1],
+    floating[:, ::1],
+    floating[::1],
+    const int[::1]
+)
+
+cdef void _average_centers(
+    floating[:, ::1],
+    const floating[::1]
+)
+
+cdef void _center_shift(
+    const floating[:, ::1],
+    const floating[:, ::1],
+    floating[::1]
+)
@@ -0,0 +1,331 @@
+# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#         Olivier Grisel <olivier.grisel@ensta.org>
+#         Lars Buitinck
+#
+# License: BSD 3 clause
+
+import numpy as np
+from cython cimport floating
+from cython.parallel cimport prange
+from libc.math cimport sqrt
+
+from ..utils.extmath import row_norms
+
+
+# Number of samples per data chunk, defined as a module-level constant so that
+# other modules can import it by name.
+CHUNK_SIZE = 256
+
+
+cdef floating _euclidean_dense_dense(
+        const floating* a,  # IN
+        const floating* b,  # IN
+        int n_features,
+        bint squared
+) noexcept nogil:
+    """Euclidean distance between two dense vectors.
+
+    ``a`` and ``b`` each point to the first of ``n_features`` values. The sum
+    of squared differences is returned as is when ``squared`` is true,
+    otherwise its square root is returned.
+    """
+    cdef:
+        int block, tail
+        int n_blocks = n_features // 4
+        int n_tail = n_features % 4
+        int offset = 0
+        floating acc = 0
+
+    # Main part: the loop body is unrolled by hand, four features per pass.
+    for block in range(n_blocks):
+        acc += (
+            (a[offset] - b[offset]) * (a[offset] - b[offset]) +
+            (a[offset + 1] - b[offset + 1]) * (a[offset + 1] - b[offset + 1]) +
+            (a[offset + 2] - b[offset + 2]) * (a[offset + 2] - b[offset + 2]) +
+            (a[offset + 3] - b[offset + 3]) * (a[offset + 3] - b[offset + 3])
+        )
+        offset += 4
+
+    # Leftover features (fewer than four) that did not fill a whole block.
+    for tail in range(n_tail):
+        acc += (a[offset + tail] - b[offset + tail]) * (a[offset + tail] - b[offset + tail])
+
+    if squared:
+        return acc
+    return sqrt(acc)
+
+
+def _euclidean_dense_dense_wrapper(
+    const floating[::1] a,
+    const floating[::1] b,
+    bint squared
+):
+    """Python-callable wrapper of _euclidean_dense_dense, for testing purpose.
+
+    The number of features handed to the C function is the length of ``a``.
+    """
+    cdef int n_features = a.shape[0]
+    return _euclidean_dense_dense(&a[0], &b[0], n_features, squared)
+
+
+cdef floating _euclidean_sparse_dense(
+        const floating[::1] a_data,  # IN
+        const int[::1] a_indices,    # IN
+        const floating[::1] b,       # IN
+        floating b_squared_norm,
+        bint squared
+) noexcept nogil:
+    """Euclidean distance between a sparse vector ``a`` and a dense vector ``b``.
+
+    ``a`` is described by its stored values ``a_data`` and their positions
+    ``a_indices``. ``b_squared_norm`` is added to the accumulated sum
+    (presumably the caller passes the squared norm of ``b``). The result is
+    clipped at zero, and its square root is taken unless ``squared`` is true.
+    """
+    cdef:
+        int n_stored = a_indices.shape[0]
+        int pos
+        floating diff, b_val
+        floating acc = 0.0
+
+    # Only the stored entries of ``a`` are visited: each one contributes
+    # (a_i - b_i)^2 and cancels the b_i^2 term carried by ``b_squared_norm``.
+    for pos in range(n_stored):
+        b_val = b[a_indices[pos]]
+        diff = a_data[pos] - b_val
+        acc += diff * diff - b_val * b_val
+
+    acc += b_squared_norm
+
+    # The subtraction above can leave a negative total; never return one.
+    if acc < 0:
+        acc = 0.0
+
+    if squared:
+        return acc
+    return sqrt(acc)
+
+
+def _euclidean_sparse_dense_wrapper(
+        const floating[::1] a_data,
+        const int[::1] a_indices,
+        const floating[::1] b,
+        floating b_squared_norm,
+        bint squared
+):
+    """Python-callable wrapper of _euclidean_sparse_dense, for testing purpose.
+
+    All arguments are forwarded unchanged.
+    """
+    cdef floating distance = _euclidean_sparse_dense(
+        a_data, a_indices, b, b_squared_norm, squared
+    )
+    return distance
+
+
+cpdef floating _inertia_dense(
+        const floating[:, ::1] X,           # IN
+        const floating[::1] sample_weight,  # IN
+        const floating[:, ::1] centers,     # IN
+        const int[::1] labels,              # IN
+        int n_threads,
+        int single_label=-1,
+):
+    """Compute inertia for dense input data
+
+    Sum, over the samples, of the squared distance between each sample and
+    its assigned center multiplied by the sample's weight.
+
+    If single_label is >= 0, the inertia is computed only for that label.
+
+    Parameters
+    ----------
+    X : memoryview of shape (n_samples, n_features)
+        The samples, one per row.
+
+    sample_weight : memoryview of shape (n_samples,)
+        Weight applied to the squared distance of each sample.
+
+    centers : memoryview of shape (n_clusters, n_features)
+        The cluster centers, indexed by the values in ``labels``.
+
+    labels : memoryview of shape (n_samples,)
+        Index of the center assigned to each sample.
+
+    n_threads : int
+        Passed as ``num_threads`` to the parallel loop.
+
+    single_label : int, default=-1
+        When non-negative, only samples with this label contribute.
+
+    Returns
+    -------
+    inertia : floating
+        The accumulated weighted sum of squared distances.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int i, j
+
+        floating sq_dist = 0.0
+        floating inertia = 0.0
+
+    # Parallel loop over the samples. ``inertia`` is only updated with an
+    # in-place ``+=``, which is the form Cython's prange uses for reductions,
+    # so do not rewrite it as ``inertia = inertia + ...``.
+    for i in prange(n_samples, nogil=True, num_threads=n_threads,
+                    schedule='static'):
+        j = labels[i]
+        # A negative single_label means "use every sample".
+        if single_label < 0 or single_label == j:
+            sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
+                                             n_features, True)
+            inertia += sq_dist * sample_weight[i]
+
+    return inertia
+
+
+cpdef floating _inertia_sparse(
+        X,                                  # IN
+        const floating[::1] sample_weight,  # IN
+        const floating[:, ::1] centers,     # IN
+        const int[::1] labels,              # IN
+        int n_threads,
+        int single_label=-1,
+):
+    """Compute inertia for sparse input data
+
+    Sum, over the samples, of the squared distance between each sample and
+    its assigned center multiplied by the sample's weight.
+
+    If single_label is >= 0, the inertia is computed only for that label.
+
+    Parameters
+    ----------
+    X : object exposing ``data``, ``indices``, ``indptr`` and ``shape``
+        The samples. Rows are read through ``indptr`` (presumably a CSR
+        sparse matrix -- confirm against the callers).
+
+    sample_weight : memoryview of shape (n_samples,)
+        Weight applied to the squared distance of each sample.
+
+    centers : memoryview of shape (n_clusters, n_features)
+        The cluster centers, indexed by the values in ``labels``.
+
+    labels : memoryview of shape (n_samples,)
+        Index of the center assigned to each sample.
+
+    n_threads : int
+        Passed as ``num_threads`` to the parallel loop.
+
+    single_label : int, default=-1
+        When non-negative, only samples with this label contribute.
+
+    Returns
+    -------
+    inertia : floating
+        The accumulated weighted sum of squared distances.
+    """
+    cdef:
+        floating[::1] X_data = X.data
+        int[::1] X_indices = X.indices
+        int[::1] X_indptr = X.indptr
+
+        int n_samples = X.shape[0]
+        int i, j
+
+        floating sq_dist = 0.0
+        floating inertia = 0.0
+
+        # Squared norm of every center, computed once and reused for all
+        # samples by the sparse distance routine.
+        floating[::1] centers_squared_norms = row_norms(centers, squared=True)
+
+    # Parallel loop over the samples. ``inertia`` is only updated with an
+    # in-place ``+=``, which is the form Cython's prange uses for reductions,
+    # so do not rewrite it as ``inertia = inertia + ...``.
+    for i in prange(n_samples, nogil=True, num_threads=n_threads,
+                    schedule='static'):
+        j = labels[i]
+        # A negative single_label means "use every sample".
+        if single_label < 0 or single_label == j:
+            # Row i of X is the slice [indptr[i], indptr[i + 1]) of data/indices.
+            sq_dist = _euclidean_sparse_dense(
+                X_data[X_indptr[i]: X_indptr[i + 1]],
+                X_indices[X_indptr[i]: X_indptr[i + 1]],
+                centers[j], centers_squared_norms[j], True)
+            inertia += sq_dist * sample_weight[i]
+
+    return inertia
+
+
+cpdef void _relocate_empty_clusters_dense(
+        const floating[:, ::1] X,            # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # INOUT
+        floating[::1] weight_in_clusters,    # INOUT
+        const int[::1] labels                # IN
+):
+    """Relocate centers which have no sample assigned to them.
+
+    A cluster is considered empty when its entry in ``weight_in_clusters`` is
+    exactly 0. Each empty cluster receives one of the samples that lie
+    farthest from their currently assigned (old) center: the sample's weighted
+    coordinates are subtracted from the row of its previous cluster in
+    ``centers_new`` and written as the row of the empty cluster, and
+    ``weight_in_clusters`` is adjusted accordingly.
+
+    Nothing is modified when there is no empty cluster, or when every sample
+    is at distance 0 from its assigned center.
+
+    Parameters
+    ----------
+    X : memoryview of shape (n_samples, n_features)
+        The samples.
+
+    sample_weight : memoryview of shape (n_samples,)
+        The weight of each sample.
+
+    centers_old : memoryview of shape (n_clusters, n_features)
+        Centers used to measure how far each sample is from its cluster.
+
+    centers_new : memoryview of shape (n_clusters, n_features)
+        Per-cluster accumulators, modified in place.
+
+    weight_in_clusters : memoryview of shape (n_clusters,)
+        Per-cluster sum of weights, modified in place.
+
+    labels : memoryview of shape (n_samples,)
+        Index of the cluster each sample is assigned to.
+    """
+    cdef:
+        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
+        int n_empty = empty_clusters.shape[0]
+
+    if n_empty == 0:
+        return
+
+    cdef:
+        int n_features = X.shape[1]
+
+        # Squared distance of every sample to its assigned old center.
+        floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
+
+    if np.max(distances) == 0:
+        # Happens when there are more clusters than non-duplicate samples. Relocating
+        # is pointless in this case.
+        return
+
+    # The candidate selection is done only after the degenerate case above has
+    # been ruled out, as in the sparse variant: it is useless work otherwise.
+    cdef:
+        # Indices of the n_empty samples with the largest distances.
+        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
+
+        int new_cluster_id, old_cluster_id, far_idx, idx, k
+        floating weight
+
+    for idx in range(n_empty):
+
+        new_cluster_id = empty_clusters[idx]
+
+        far_idx = far_from_centers[idx]
+        weight = sample_weight[far_idx]
+
+        old_cluster_id = labels[far_idx]
+
+        # Move the weighted sample from its previous cluster's accumulator to
+        # the empty cluster, which then contains this sample only.
+        for k in range(n_features):
+            centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
+            centers_new[new_cluster_id, k] = X[far_idx, k] * weight
+
+        weight_in_clusters[new_cluster_id] = weight
+        weight_in_clusters[old_cluster_id] -= weight
+
+
+cpdef void _relocate_empty_clusters_sparse(
+        const floating[::1] X_data,          # IN
+        const int[::1] X_indices,            # IN
+        const int[::1] X_indptr,             # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # INOUT
+        floating[::1] weight_in_clusters,    # INOUT
+        const int[::1] labels                # IN
+):
+    """Relocate centers which have no sample assigned to them.
+
+    Sparse counterpart of the dense routine: the samples are given by their
+    ``data`` / ``indices`` / ``indptr`` arrays. Every cluster whose weight is
+    exactly 0 receives one of the samples lying farthest from their assigned
+    old center; ``centers_new`` and ``weight_in_clusters`` are updated in
+    place. Nothing happens when no cluster is empty or when all samples sit
+    at distance 0 from their center.
+    """
+    cdef:
+        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
+        int n_empty = empty_clusters.shape[0]
+
+    if n_empty == 0:
+        return
+
+    cdef:
+        int n_samples = X_indptr.shape[0] - 1
+        int row, label, ptr
+        int row_start, row_end
+
+        floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
+        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
+
+    # Squared distance of every sample to the old center it is assigned to.
+    for row in range(n_samples):
+        label = labels[row]
+        row_start = X_indptr[row]
+        row_end = X_indptr[row + 1]
+        distances[row] = _euclidean_sparse_dense(
+            X_data[row_start:row_end],
+            X_indices[row_start:row_end],
+            centers_old[label], centers_squared_norms[label], True)
+
+    if np.max(distances) == 0:
+        # Happens when there are more clusters than non-duplicate samples. Relocating
+        # is pointless in this case.
+        return
+
+    cdef:
+        # Indices of the n_empty samples with the largest distances.
+        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
+
+        int slot, target_cluster, source_cluster, sample_idx
+        floating weight
+
+    for slot in range(n_empty):
+        target_cluster = empty_clusters[slot]
+        sample_idx = far_from_centers[slot]
+        source_cluster = labels[sample_idx]
+        weight = sample_weight[sample_idx]
+
+        # Move the weighted stored entries of the sample from its previous
+        # cluster's accumulator to the empty cluster.
+        row_start = X_indptr[sample_idx]
+        row_end = X_indptr[sample_idx + 1]
+        for ptr in range(row_start, row_end):
+            centers_new[source_cluster, X_indices[ptr]] -= X_data[ptr] * weight
+            centers_new[target_cluster, X_indices[ptr]] = X_data[ptr] * weight
+
+        weight_in_clusters[target_cluster] = weight
+        weight_in_clusters[source_cluster] -= weight
+
+
+cdef void _average_centers(
+        floating[:, ::1] centers,               # INOUT
+        const floating[::1] weight_in_clusters  # IN
+):
+    """Average new centers wrt weights.
+
+    Rows of ``centers`` whose weight is strictly positive are multiplied by
+    the inverse of that weight. Any other row is overwritten with the current
+    content of the row belonging to the cluster of largest weight.
+    """
+    cdef:
+        int n_clusters = centers.shape[0]
+        int n_features = centers.shape[1]
+        int cluster, feat
+        floating inv_weight
+        int heaviest = np.argmax(weight_in_clusters)
+
+    for cluster in range(n_clusters):
+        if not weight_in_clusters[cluster] > 0:
+            # For convenience, we avoid setting empty clusters at the origin but
+            # place them at the location of the biggest cluster.
+            for feat in range(n_features):
+                centers[cluster, feat] = centers[heaviest, feat]
+            continue
+
+        inv_weight = 1.0 / weight_in_clusters[cluster]
+        for feat in range(n_features):
+            centers[cluster, feat] *= inv_weight
+
+
+cdef void _center_shift(
+        const floating[:, ::1] centers_old,  # IN
+        const floating[:, ::1] centers_new,  # IN
+        floating[::1] center_shift           # OUT
+):
+    """Compute shift between old and new centers.
+
+    ``center_shift[c]`` receives the (non-squared) Euclidean distance between
+    row ``c`` of ``centers_new`` and row ``c`` of ``centers_old``.
+    """
+    cdef:
+        int n_features = centers_old.shape[1]
+        int n_centers = centers_old.shape[0]
+        int cluster
+        floating shift
+
+    for cluster in range(n_centers):
+        shift = _euclidean_dense_dense(
+            &centers_new[cluster, 0], &centers_old[cluster, 0], n_features, False)
+        center_shift[cluster] = shift
+
+
+def _is_same_clustering(
+    const int[::1] labels1,
+    const int[::1] labels2,
+    n_clusters
+):
+    """Check if two arrays of labels are the same up to a permutation of the labels
+
+    The two labelings match when there is a one-to-one correspondence between
+    the labels used in ``labels1`` and those used in ``labels2``. Both
+    directions are tracked, so that two distinct labels of ``labels1`` mapped
+    onto a single label of ``labels2`` are rejected; the result therefore does
+    not depend on the order of the two arguments.
+
+    Parameters
+    ----------
+    labels1, labels2 : memoryview of shape (n_samples,), dtype=int32
+        The labelings to compare. Values are used as indices into arrays of
+        length ``n_clusters``.
+
+    n_clusters : int
+        Size of the label-to-label lookup tables.
+
+    Returns
+    -------
+    bool
+        True if the labelings are identical up to a relabeling.
+    """
+    # forward[a] is the label of labels2 paired with label a of labels1 and
+    # backward[b] the label of labels1 paired with label b of labels2;
+    # -1 means "not paired yet".
+    cdef int[::1] forward = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
+    cdef int[::1] backward = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
+    cdef int i, first, second
+
+    for i in range(labels1.shape[0]):
+        first = labels1[i]
+        second = labels2[i]
+        if forward[first] == -1:
+            if backward[second] != -1:
+                # ``second`` is already paired with another label of labels1:
+                # the correspondence would not be one-to-one.
+                return False
+            forward[first] = second
+            backward[second] = first
+        elif forward[first] != second:
+            return False
+    return True
@@ -0,0 +1,687 @@
+# Author: Andreas Mueller
+#
+# Licence: BSD 3 clause
+
+from cython cimport floating
+from cython.parallel import prange, parallel
+from libc.stdlib cimport calloc, free
+from libc.string cimport memset
+
+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
+from ..utils.extmath import row_norms
+from ._k_means_common import CHUNK_SIZE
+from ._k_means_common cimport _relocate_empty_clusters_dense
+from ._k_means_common cimport _relocate_empty_clusters_sparse
+from ._k_means_common cimport _euclidean_dense_dense
+from ._k_means_common cimport _euclidean_sparse_dense
+from ._k_means_common cimport _average_centers
+from ._k_means_common cimport _center_shift
+
+
+def init_bounds_dense(
+        const floating[:, ::1] X,                      # IN
+        const floating[:, ::1] centers,                # IN
+        const floating[:, ::1] center_half_distances,  # IN
+        int[::1] labels,                               # OUT
+        floating[::1] upper_bounds,                    # OUT
+        floating[:, ::1] lower_bounds,                 # OUT
+        int n_threads):
+    """Initialize upper and lower bounds for each sample for dense input data.
+
+    Given X, centers and the pairwise distances divided by 2.0 between the
+    centers this calculates the upper bounds and lower bounds for each sample.
+    The upper bound for each sample is set to the distance between the sample
+    and the closest center.
+
+    The lower bound for each sample is a one-dimensional array of n_clusters.
+    For each sample i assume that the previously assigned cluster is c1 and the
+    previous closest distance is dist, for a new cluster c2, the
+    lower_bound[i][c2] is set to distance between the sample and this new
+    cluster, if and only if dist > center_half_distances[c1][c2]. This prevents
+    computation of unnecessary distances for each sample to the clusters that
+    it is unlikely to be assigned to.
+
+    Entries of lower_bounds for which that condition does not hold are not
+    written by this function and keep whatever value they had on entry.
+
+    Parameters
+    ----------
+    X : ndarray of shape (n_samples, n_features), dtype=floating
+        The input data.
+
+    centers : ndarray of shape (n_clusters, n_features), dtype=floating
+        The cluster centers.
+
+    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
+            dtype=floating
+        The half of the distance between any 2 clusters centers.
+
+    labels : ndarray of shape(n_samples), dtype=int
+        The label for each sample. This array is modified in place.
+
+    upper_bounds : ndarray of shape(n_samples,), dtype=floating
+        The upper bound on the distance between each sample and its closest
+        cluster center. This array is modified in place.
+
+    lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating
+        The lower bound on the distance between each sample and each cluster
+        center. This array is modified in place.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_clusters = centers.shape[0]
+        int n_features = X.shape[1]
+
+        floating min_dist, dist
+        int best_cluster, i, j
+
+    # Samples are processed independently, in parallel.
+    for i in prange(
+        n_samples, num_threads=n_threads, schedule='static', nogil=True
+    ):
+        # Start with center 0 as the provisional closest center.
+        best_cluster = 0
+        min_dist = _euclidean_dense_dense(&X[i, 0], &centers[0, 0],
+                                          n_features, False)
+        lower_bounds[i, 0] = min_dist
+        for j in range(1, n_clusters):
+            # Center j is only examined when the current best distance exceeds
+            # half the distance between the current best center and center j.
+            if min_dist > center_half_distances[best_cluster, j]:
+                dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
+                                              n_features, False)
+                lower_bounds[i, j] = dist
+                if dist < min_dist:
+                    min_dist = dist
+                    best_cluster = j
+        labels[i] = best_cluster
+        upper_bounds[i] = min_dist
+
+
+def init_bounds_sparse(
+        X,                                             # IN
+        const floating[:, ::1] centers,                # IN
+        const floating[:, ::1] center_half_distances,  # IN
+        int[::1] labels,                               # OUT
+        floating[::1] upper_bounds,                    # OUT
+        floating[:, ::1] lower_bounds,                 # OUT
+        int n_threads):
+    """Initialize upper and lower bounds for each sample for sparse input data.
+
+    Given X, centers and the pairwise distances divided by 2.0 between the
+    centers this calculates the upper bounds and lower bounds for each sample.
+    The upper bound for each sample is set to the distance between the sample
+    and the closest center.
+
+    The lower bound for each sample is a one-dimensional array of n_clusters.
+    For each sample i assume that the previously assigned cluster is c1 and the
+    previous closest distance is dist, for a new cluster c2, the
+    lower_bound[i][c2] is set to distance between the sample and this new
+    cluster, if and only if dist > center_half_distances[c1][c2]. This prevents
+    computation of unnecessary distances for each sample to the clusters that
+    it is unlikely to be assigned to.
+
+    Entries of lower_bounds for which that condition does not hold are not
+    written by this function and keep whatever value they had on entry.
+
+    Parameters
+    ----------
+    X : sparse matrix of shape (n_samples, n_features), dtype=floating
+        The input data. Must be in CSR format.
+
+    centers : ndarray of shape (n_clusters, n_features), dtype=floating
+        The cluster centers.
+
+    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
+            dtype=floating
+        The half of the distance between any 2 clusters centers.
+
+    labels : ndarray of shape(n_samples), dtype=int
+        The label for each sample. This array is modified in place.
+
+    upper_bounds : ndarray of shape(n_samples,), dtype=floating
+        The upper bound on the distance between each sample and its closest
+        cluster center. This array is modified in place.
+
+    lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating
+        The lower bound on the distance between each sample and each cluster
+        center. This array is modified in place.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_clusters = centers.shape[0]
+
+        floating[::1] X_data = X.data
+        int[::1] X_indices = X.indices
+        int[::1] X_indptr = X.indptr
+
+        floating min_dist, dist
+        int best_cluster, i, j
+
+        # Squared norm of every center, computed once and reused for all
+        # samples by the sparse distance routine.
+        floating[::1] centers_squared_norms = row_norms(centers, squared=True)
+
+    # Samples are processed independently, in parallel.
+    for i in prange(
+        n_samples, num_threads=n_threads, schedule='static', nogil=True
+    ):
+        # Start with center 0 as the provisional closest center. Row i of X is
+        # the slice [indptr[i], indptr[i + 1]) of data/indices.
+        best_cluster = 0
+        min_dist = _euclidean_sparse_dense(
+            X_data[X_indptr[i]: X_indptr[i + 1]],
+            X_indices[X_indptr[i]: X_indptr[i + 1]],
+            centers[0], centers_squared_norms[0], False)
+
+        lower_bounds[i, 0] = min_dist
+        for j in range(1, n_clusters):
+            # Center j is only examined when the current best distance exceeds
+            # half the distance between the current best center and center j.
+            if min_dist > center_half_distances[best_cluster, j]:
+                dist = _euclidean_sparse_dense(
+                    X_data[X_indptr[i]: X_indptr[i + 1]],
+                    X_indices[X_indptr[i]: X_indptr[i + 1]],
+                    centers[j], centers_squared_norms[j], False)
+                lower_bounds[i, j] = dist
+                if dist < min_dist:
+                    min_dist = dist
+                    best_cluster = j
+        labels[i] = best_cluster
+        upper_bounds[i] = min_dist
+
+
+def elkan_iter_chunked_dense(
+        const floating[:, ::1] X,                      # IN
+        const floating[::1] sample_weight,             # IN
+        const floating[:, ::1] centers_old,            # IN
+        floating[:, ::1] centers_new,                  # OUT
+        floating[::1] weight_in_clusters,              # OUT
+        const floating[:, ::1] center_half_distances,  # IN
+        const floating[::1] distance_next_center,      # IN
+        floating[::1] upper_bounds,                    # INOUT
+        floating[:, ::1] lower_bounds,                 # INOUT
+        int[::1] labels,                               # INOUT
+        floating[::1] center_shift,                    # OUT
+        int n_threads,
+        bint update_centers=True):
+    """Single iteration of K-means Elkan algorithm with dense input.
+
+    Update labels and centers (inplace), for one iteration, distributed
+    over data chunks.
+
+    Parameters
+    ----------
+    X : ndarray of shape (n_samples, n_features), dtype=floating
+        The observations to cluster.
+
+    sample_weight : ndarray of shape (n_samples,), dtype=floating
+        The weights for each observation in X.
+
+    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers before previous iteration, placeholder for the centers after
+        previous iteration.
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration.
+
+    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
+        Placeholder for the sums of the weights of every observation assigned
+        to each center.
+
+    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
+            dtype=floating
+        Half pairwise distances between centers.
+
+    distance_next_center : ndarray of shape (n_clusters,), dtype=floating
+        Distance between each center and its closest center.
+
+    upper_bounds : ndarray of shape (n_samples,), dtype=floating
+        Upper bound for the distance between each sample and its center,
+        updated inplace.
+
+    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
+        Lower bound for the distance between each sample and each center,
+        updated inplace.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    center_shift : ndarray of shape (n_clusters,), dtype=floating
+        Distance between old and new centers.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+
+    update_centers : bool
+        - If True, the labels and the new centers will be computed, i.e. runs
+          the E-step and the M-step of the algorithm.
+        - If False, only the labels will be computed, i.e runs the E-step of
+          the algorithm. This is useful especially when calling predict on a
+          fitted model.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int n_clusters = centers_new.shape[0]
+
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
+        # hard-coded number of samples per chunk. Splitting in chunks is
+        # necessary to get parallelism. Chunk size chosen to be same as lloyd's
+        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
+        int n_chunks = n_samples // n_samples_chunk
+        int n_samples_rem = n_samples % n_samples_chunk
+        int chunk_idx
+        int start, end
+
+        int i, j, k
+
+        # Per-thread accumulators, allocated inside the parallel block below.
+        floating *centers_new_chunk
+        floating *weight_in_clusters_chunk
+
+        omp_lock_t lock
+
+    # count remainder chunk in total number of chunks
+    # (the comparison contributes 1 when the chunks do not divide n_samples)
+    n_chunks += n_samples != n_chunks * n_samples_chunk
+
+    # number of threads should not be bigger than number of chunks
+    n_threads = min(n_threads, n_chunks)
+
+    if update_centers:
+        # Reset the output accumulators; the lock is only needed (and only
+        # initialized) when the centers are updated.
+        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
+        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
+        omp_init_lock(&lock)
+
+    with nogil, parallel(num_threads=n_threads):
+        # thread local buffers
+        # NOTE(review): the results of calloc are not checked for NULL.
+        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
+        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
+
+        for chunk_idx in prange(n_chunks, schedule='static'):
+            start = chunk_idx * n_samples_chunk
+            # Only the last chunk can be shorter than n_samples_chunk.
+            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
+                end = start + n_samples_rem
+            else:
+                end = start + n_samples_chunk
+
+            _update_chunk_dense(
+                X[start: end],
+                sample_weight[start: end],
+                centers_old,
+                center_half_distances,
+                distance_next_center,
+                labels[start: end],
+                upper_bounds[start: end],
+                lower_bounds[start: end],
+                centers_new_chunk,
+                weight_in_clusters_chunk,
+                update_centers)
+
+        # reduction from local buffers.
+        if update_centers:
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
+            for j in range(n_clusters):
+                weight_in_clusters[j] += weight_in_clusters_chunk[j]
+                for k in range(n_features):
+                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
+            omp_unset_lock(&lock)
+
+        free(centers_new_chunk)
+        free(weight_in_clusters_chunk)
+
+    if update_centers:
+        omp_destroy_lock(&lock)
+        # Turn the accumulated weighted sums into centers: give empty clusters
+        # a sample, divide by the cluster weights, then measure how far each
+        # center moved.
+        _relocate_empty_clusters_dense(X, sample_weight, centers_old,
+                                       centers_new, weight_in_clusters, labels)
+
+        _average_centers(centers_new, weight_in_clusters)
+        _center_shift(centers_old, centers_new, center_shift)
+
+        # update lower and upper bounds
+        # The upper bound grows by the shift of the sample's own center; every
+        # lower bound shrinks by the shift of its center and is clipped at 0.
+        for i in range(n_samples):
+            upper_bounds[i] += center_shift[labels[i]]
+
+            for j in range(n_clusters):
+                lower_bounds[i, j] -= center_shift[j]
+                if lower_bounds[i, j] < 0:
+                    lower_bounds[i, j] = 0
+
+
+cdef void _update_chunk_dense(
+        const floating[:, ::1] X,                      # IN
+        const floating[::1] sample_weight,             # IN
+        const floating[:, ::1] centers_old,            # IN
+        const floating[:, ::1] center_half_distances,  # IN
+        const floating[::1] distance_next_center,      # IN
+        int[::1] labels,                               # INOUT
+        floating[::1] upper_bounds,                    # INOUT
+        floating[:, ::1] lower_bounds,                 # INOUT
+        floating *centers_new,                         # OUT
+        floating *weight_in_clusters,                  # OUT
+        bint update_centers) noexcept nogil:
+    """K-means combined EM step for one dense data chunk.
+
+    Compute the partial contribution of a single data chunk to the labels and
+    centers.
+
+    For every sample of the chunk the label and the bounds are refreshed,
+    computing exact distances only when the bounds cannot rule a center out.
+    When ``update_centers`` is true, the sample's weight and weighted
+    coordinates are added to ``weight_in_clusters`` and ``centers_new``, the
+    latter being a flat buffer indexed as ``label * n_features + k``.
+    """
+    cdef:
+        int n_samples = labels.shape[0]
+        int n_clusters = centers_old.shape[0]
+        int n_features = centers_old.shape[1]
+
+        floating upper_bound, distance
+        int i, j, k, label
+
+    for i in range(n_samples):
+        upper_bound = upper_bounds[i]
+        # Set to 1 once upper_bound has been replaced by the exact distance to
+        # the current center. (bounds_tight is not declared in the cdef block
+        # above; its type is left to Cython.)
+        bounds_tight = 0
+        label = labels[i]
+
+        # Next center is not far away from the currently assigned center.
+        # Sample might need to be assigned to another center.
+        if not distance_next_center[label] >= upper_bound:
+
+            for j in range(n_clusters):
+
+                # If this holds, then center j is a good candidate for the
+                # sample to be relabelled, and we need to confirm this by
+                # recomputing the upper and lower bounds.
+                if (
+                    j != label
+                    and (upper_bound > lower_bounds[i, j])
+                    and (upper_bound > center_half_distances[label, j])
+                ):
+
+                    # Recompute upper bound by calculating the actual distance
+                    # between the sample and its current assigned center.
+                    if not bounds_tight:
+                        upper_bound = _euclidean_dense_dense(
+                            &X[i, 0], &centers_old[label, 0], n_features, False)
+                        lower_bounds[i, label] = upper_bound
+                        bounds_tight = 1
+
+                    # If the condition still holds, then compute the actual
+                    # distance between the sample and center. If this is less
+                    # than the previous distance, reassign label.
+                    # NOTE(review): the two tests are joined with `or` here but
+                    # with `and` in the guard above; `or` is the more
+                    # permissive form (more distances get computed) -- confirm
+                    # this is intended.
+                    if (
+                        upper_bound > lower_bounds[i, j]
+                        or (upper_bound > center_half_distances[label, j])
+                    ):
+
+                        distance = _euclidean_dense_dense(
+                            &X[i, 0], &centers_old[j, 0], n_features, False)
+                        lower_bounds[i, j] = distance
+                        if distance < upper_bound:
+                            label = j
+                            upper_bound = distance
+
+            labels[i] = label
+            upper_bounds[i] = upper_bound
+
+        if update_centers:
+            # Accumulate the sample into the buffers of its (possibly new)
+            # cluster.
+            weight_in_clusters[label] += sample_weight[i]
+            for k in range(n_features):
+                centers_new[label * n_features + k] += X[i, k] * sample_weight[i]
+
+
+def elkan_iter_chunked_sparse(
+        X,                                             # IN
+        const floating[::1] sample_weight,             # IN
+        const floating[:, ::1] centers_old,            # IN
+        floating[:, ::1] centers_new,                  # OUT
+        floating[::1] weight_in_clusters,              # OUT
+        const floating[:, ::1] center_half_distances,  # IN
+        const floating[::1] distance_next_center,      # IN
+        floating[::1] upper_bounds,                    # INOUT
+        floating[:, ::1] lower_bounds,                 # INOUT
+        int[::1] labels,                               # INOUT
+        floating[::1] center_shift,                    # OUT
+        int n_threads,
+        bint update_centers=True):
+    """Single iteration of K-means Elkan algorithm with sparse input.
+
+    Update labels and centers (inplace), for one iteration, distributed
+    over data chunks.
+
+    Parameters
+    ----------
+    X : sparse matrix of shape (n_samples, n_features)
+        The observations to cluster. Must be in CSR format.
+
+    sample_weight : ndarray of shape (n_samples,), dtype=floating
+        The weights for each observation in X.
+
+    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers before previous iteration, placeholder for the centers after
+        previous iteration.
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration.
+
+    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
+        Placeholder for the sums of the weights of every observation assigned
+        to each center.
+
+    center_half_distances : ndarray of shape (n_clusters, n_clusters), \
+            dtype=floating
+        Half pairwise distances between centers.
+
+    distance_next_center : ndarray of shape (n_clusters,), dtype=floating
+        Distance between each center its closest center.
+
+    upper_bounds : ndarray of shape (n_samples,), dtype=floating
+        Upper bound for the distance between each sample and its center,
+        updated inplace.
+
+    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
+        Lower bound for the distance between each sample and each center,
+        updated inplace.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    center_shift : ndarray of shape (n_clusters,), dtype=floating
+        Distance between old and new centers.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+
+    update_centers : bool
+        - If True, the labels and the new centers will be computed, i.e. runs
+          the E-step and the M-step of the algorithm.
+        - If False, only the labels will be computed, i.e runs the E-step of
+          the algorithm. This is useful especially when calling predict on a
+          fitted model.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int n_clusters = centers_new.shape[0]
+
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outiers.
+        return
+
+    cdef:
+        floating[::1] X_data = X.data
+        int[::1] X_indices = X.indices
+        int[::1] X_indptr = X.indptr
+
+        # hard-coded number of samples per chunk. Splitting in chunks is
+        # necessary to get parallelism. Chunk size chosen to be same as lloyd's
+        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
+        int n_chunks = n_samples // n_samples_chunk
+        int n_samples_rem = n_samples % n_samples_chunk
+        int chunk_idx
+        int start, end
+
+        int i, j, k
+
+        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
+
+        floating *centers_new_chunk
+        floating *weight_in_clusters_chunk
+
+        omp_lock_t lock
+
+    # count remainder chunk in total number of chunks
+    n_chunks += n_samples != n_chunks * n_samples_chunk
+
+    # number of threads should not be bigger than number of chunks
+    n_threads = min(n_threads, n_chunks)
+
+    if update_centers:
+        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
+        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
+        omp_init_lock(&lock)
+
+    with nogil, parallel(num_threads=n_threads):
+        # thread local buffers
+        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
+        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
+
+        for chunk_idx in prange(n_chunks, schedule='static'):
+            start = chunk_idx * n_samples_chunk
+            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
+                end = start + n_samples_rem
+            else:
+                end = start + n_samples_chunk
+
+            _update_chunk_sparse(
+                X_data[X_indptr[start]: X_indptr[end]],
+                X_indices[X_indptr[start]: X_indptr[end]],
+                X_indptr[start: end+1],
+                sample_weight[start: end],
+                centers_old,
+                centers_squared_norms,
+                center_half_distances,
+                distance_next_center,
+                labels[start: end],
+                upper_bounds[start: end],
+                lower_bounds[start: end],
+                centers_new_chunk,
+                weight_in_clusters_chunk,
+                update_centers)
+
+        # reduction from local buffers.
+        if update_centers:
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
+            for j in range(n_clusters):
+                weight_in_clusters[j] += weight_in_clusters_chunk[j]
+                for k in range(n_features):
+                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
+            omp_unset_lock(&lock)
+
+        free(centers_new_chunk)
+        free(weight_in_clusters_chunk)
+
+    if update_centers:
+        omp_destroy_lock(&lock)
+        _relocate_empty_clusters_sparse(
+            X_data, X_indices, X_indptr, sample_weight,
+            centers_old, centers_new, weight_in_clusters, labels)
+
+        _average_centers(centers_new, weight_in_clusters)
+        _center_shift(centers_old, centers_new, center_shift)
+
+        # update lower and upper bounds
+        for i in range(n_samples):
+            upper_bounds[i] += center_shift[labels[i]]
+
+            for j in range(n_clusters):
+                lower_bounds[i, j] -= center_shift[j]
+                if lower_bounds[i, j] < 0:
+                    lower_bounds[i, j] = 0
+
+
+cdef void _update_chunk_sparse(
+        const floating[::1] X_data,                    # IN
+        const int[::1] X_indices,                      # IN
+        const int[::1] X_indptr,                       # IN
+        const floating[::1] sample_weight,             # IN
+        const floating[:, ::1] centers_old,            # IN
+        const floating[::1] centers_squared_norms,     # IN
+        const floating[:, ::1] center_half_distances,  # IN
+        const floating[::1] distance_next_center,      # IN
+        int[::1] labels,                               # INOUT
+        floating[::1] upper_bounds,                    # INOUT
+        floating[:, ::1] lower_bounds,                 # INOUT
+        floating *centers_new,                         # OUT
+        floating *weight_in_clusters,                  # OUT
+        bint update_centers) noexcept nogil:
+    """K-means combined EM step for one sparse data chunk.
+
+    Compute the partial contribution of a single data chunk to the labels and
+    centers.
+
+    `X_data`, `X_indices` and `X_indptr` are the CSR slices of the chunk.
+    `X_indptr` is a plain slice of the full index pointer array, so its values
+    are rebased with `X_indptr[0]` before being used to index `X_data` and
+    `X_indices`.
+
+    `labels`, `upper_bounds` and `lower_bounds` are the slices matching the
+    chunk and are updated in place.
+
+    `centers_new` and `weight_in_clusters` are flat accumulation buffers that
+    are only written when `update_centers` is true; `centers_new` is indexed
+    as `label * n_features + feature`.
+    """
+    cdef:
+        int n_samples = labels.shape[0]
+        int n_clusters = centers_old.shape[0]
+        int n_features = centers_old.shape[1]
+
+        floating upper_bound, distance
+        int i, j, k, label
+        # Offset of the first non-zero of the chunk in the full CSR arrays.
+        int s = X_indptr[0]
+
+    for i in range(n_samples):
+        upper_bound = upper_bounds[i]
+        # NOTE(review): bounds_tight is not declared in the cdef block above,
+        # so its C type is left to Cython's type inference - consider an
+        # explicit `bint` declaration.
+        # It records whether upper_bound has already been replaced by an
+        # actual distance for this sample.
+        bounds_tight = 0
+        label = labels[i]
+
+        # Next center is not far away from the currently assigned center.
+        # Sample might need to be assigned to another center.
+        # (Written as `not a >= b` rather than `a < b`; the two differ only
+        # when a NaN is involved.)
+        if not distance_next_center[label] >= upper_bound:
+
+            for j in range(n_clusters):
+
+                # If this holds, then center_index is a good candidate for the
+                # sample to be relabelled, and we need to confirm this by
+                # recomputing the upper and lower bounds.
+                if (
+                    j != label
+                    and (upper_bound > lower_bounds[i, j])
+                    and (upper_bound > center_half_distances[label, j])
+                ):
+
+                    # Recompute upper bound by calculating the actual distance
+                    # between the sample and its current assigned center.
+                    # NOTE(review): the trailing `False` is presumably the
+                    # `squared` flag of _euclidean_sparse_dense - confirm
+                    # against the helper's definition.
+                    if not bounds_tight:
+                        upper_bound = _euclidean_sparse_dense(
+                            X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
+                            X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
+                            centers_old[label], centers_squared_norms[label], False)
+                        lower_bounds[i, label] = upper_bound
+                        bounds_tight = 1
+
+                    # If the condition still holds, then compute the actual
+                    # distance between the sample and center. If this is less
+                    # than the previous distance, reassign label.
+                    # NOTE(review): this re-check joins the two tests with
+                    # `or`, whereas the candidate filter above joins them with
+                    # `and` - confirm that this is intended.
+                    if (
+                        upper_bound > lower_bounds[i, j]
+                        or (upper_bound > center_half_distances[label, j])
+                    ):
+                        distance = _euclidean_sparse_dense(
+                            X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
+                            X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
+                            centers_old[j], centers_squared_norms[j], False)
+                        lower_bounds[i, j] = distance
+                        if distance < upper_bound:
+                            label = j
+                            upper_bound = distance
+
+            # Write back the (possibly changed) assignment and its bound.
+            labels[i] = label
+            upper_bounds[i] = upper_bound
+
+        if update_centers:
+            # Accumulate the weighted sample into the buffers of its cluster.
+            # Only the non-zero features of the sample are visited.
+            weight_in_clusters[label] += sample_weight[i]
+            for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
+                centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
@@ -0,0 +1,420 @@
+# Licence: BSD 3 clause
+
+from cython cimport floating
+from cython.parallel import prange, parallel
+from libc.stdlib cimport malloc, calloc, free
+from libc.string cimport memset
+from libc.float cimport DBL_MAX, FLT_MAX
+
+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
+from ..utils.extmath import row_norms
+from ..utils._cython_blas cimport _gemm
+from ..utils._cython_blas cimport RowMajor, Trans, NoTrans
+from ._k_means_common import CHUNK_SIZE
+from ._k_means_common cimport _relocate_empty_clusters_dense
+from ._k_means_common cimport _relocate_empty_clusters_sparse
+from ._k_means_common cimport _average_centers, _center_shift
+
+
+def lloyd_iter_chunked_dense(
+        const floating[:, ::1] X,            # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # OUT
+        floating[::1] weight_in_clusters,    # OUT
+        int[::1] labels,                     # OUT
+        floating[::1] center_shift,          # OUT
+        int n_threads,
+        bint update_centers=True):
+    """Single iteration of K-means lloyd algorithm with dense input.
+
+    Update labels and centers (inplace), for one iteration, distributed
+    over data chunks.
+
+    Parameters
+    ----------
+    X : ndarray of shape (n_samples, n_features), dtype=floating
+        The observations to cluster.
+
+    sample_weight : ndarray of shape (n_samples,), dtype=floating
+        The weights for each observation in X.
+
+    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers before previous iteration, placeholder for the centers after
+        previous iteration.
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration. `centers_new` can be `None` if
+        `update_centers` is False.
+
+    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
+        Placeholder for the sums of the weights of every observation assigned
+        to each center. `weight_in_clusters` can be `None` if `update_centers`
+        is False.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    center_shift : ndarray of shape (n_clusters,), dtype=floating
+        Distance between old and new centers.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+
+    update_centers : bool
+        - If True, the labels and the new centers will be computed, i.e. runs
+          the E-step and the M-step of the algorithm.
+        - If False, only the labels will be computed, i.e runs the E-step of
+          the algorithm. This is useful especially when calling predict on a
+          fitted model.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int n_clusters = centers_old.shape[0]
+
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
+        # hard-coded number of samples per chunk. Appeared to be close to
+        # optimal in all situations.
+        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
+        int n_chunks = n_samples // n_samples_chunk
+        int n_samples_rem = n_samples % n_samples_chunk
+        int chunk_idx
+        int start, end
+
+        int j, k
+
+        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
+
+        # Buffers allocated inside the parallel block below.
+        floating *centers_new_chunk
+        floating *weight_in_clusters_chunk
+        floating *pairwise_distances_chunk
+
+        # Serialises the reduction of the per-thread buffers.
+        omp_lock_t lock
+
+    # count remainder chunk in total number of chunks
+    n_chunks += n_samples != n_chunks * n_samples_chunk
+
+    # number of threads should not be bigger than number of chunks
+    n_threads = min(n_threads, n_chunks)
+
+    if update_centers:
+        # Reset the shared accumulators: the per-thread partial sums are added
+        # into them during the reduction below.
+        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
+        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
+        omp_init_lock(&lock)
+
+    with nogil, parallel(num_threads=n_threads):
+        # thread local buffers
+        # The two accumulators must start at zero, hence calloc. The distance
+        # scratch buffer is left uninitialised because _update_chunk_dense
+        # fills every entry it reads before using it.
+        # NOTE(review): the allocation results are not checked for NULL.
+        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
+        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
+        pairwise_distances_chunk = <floating*> malloc(n_samples_chunk * n_clusters * sizeof(floating))
+
+        for chunk_idx in prange(n_chunks, schedule='static'):
+            start = chunk_idx * n_samples_chunk
+            # Only the last chunk can be shorter than n_samples_chunk.
+            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
+                end = start + n_samples_rem
+            else:
+                end = start + n_samples_chunk
+
+            _update_chunk_dense(
+                X[start: end],
+                sample_weight[start: end],
+                centers_old,
+                centers_squared_norms,
+                labels[start: end],
+                centers_new_chunk,
+                weight_in_clusters_chunk,
+                pairwise_distances_chunk,
+                update_centers)
+
+        # reduction from local buffers.
+        if update_centers:
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
+            for j in range(n_clusters):
+                weight_in_clusters[j] += weight_in_clusters_chunk[j]
+                for k in range(n_features):
+                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
+
+            omp_unset_lock(&lock)
+
+        free(centers_new_chunk)
+        free(weight_in_clusters_chunk)
+        free(pairwise_distances_chunk)
+
+    if update_centers:
+        omp_destroy_lock(&lock)
+        # At this point centers_new holds weighted sums, not means. The
+        # helpers below are defined in _k_means_common; going by their names
+        # they handle clusters that received no sample, turn the sums into
+        # averages and fill center_shift - confirm against their definitions.
+        _relocate_empty_clusters_dense(
+            X, sample_weight, centers_old, centers_new, weight_in_clusters, labels
+        )
+
+        _average_centers(centers_new, weight_in_clusters)
+        _center_shift(centers_old, centers_new, center_shift)
+
+
+cdef void _update_chunk_dense(
+        const floating[:, ::1] X,                   # IN
+        const floating[::1] sample_weight,          # IN
+        const floating[:, ::1] centers_old,         # IN
+        const floating[::1] centers_squared_norms,  # IN
+        int[::1] labels,                            # OUT
+        floating *centers_new,                      # OUT
+        floating *weight_in_clusters,               # OUT
+        floating *pairwise_distances,               # OUT
+        bint update_centers) noexcept nogil:
+    """Run the combined E and M steps of K-means on one dense chunk.
+
+    Every sample of the chunk is assigned to its closest center (written to
+    `labels`). When `update_centers` is true, the weighted sample is also
+    added to the flat accumulators `centers_new` (indexed as
+    `label * n_features + feature`) and `weight_in_clusters`.
+
+    `pairwise_distances` is a scratch buffer that must hold at least
+    `n_samples * n_clusters` values; it is overwritten entirely.
+    """
+    cdef:
+        int n_samples = labels.shape[0]
+        int n_clusters = centers_old.shape[0]
+        int n_features = centers_old.shape[1]
+
+        int sample_idx, center_idx, feature_idx
+        int row_offset, best_center
+        floating candidate, best_value
+
+    # The argmin over the centers of ||x - c||² = ||x||² - 2 x.c + ||c||² does
+    # not depend on ||x||², so only the "- 2 x.c + ||c||²" part is computed.
+    # First step: every row of the buffer is initialised with ||C||².
+    for sample_idx in range(n_samples):
+        row_offset = sample_idx * n_clusters
+        for center_idx in range(n_clusters):
+            pairwise_distances[row_offset + center_idx] = centers_squared_norms[center_idx]
+
+    # Second step: buffer <- -2 * X.dot(C.T) + buffer, in one BLAS call.
+    _gemm(
+        RowMajor, NoTrans, Trans,
+        n_samples, n_clusters, n_features,
+        -2.0,
+        &X[0, 0], n_features,
+        &centers_old[0, 0], n_features,
+        1.0,
+        pairwise_distances, n_clusters)
+
+    for sample_idx in range(n_samples):
+        row_offset = sample_idx * n_clusters
+
+        # Argmin over the centers; ties keep the lowest center index.
+        best_center = 0
+        best_value = pairwise_distances[row_offset]
+        for center_idx in range(1, n_clusters):
+            candidate = pairwise_distances[row_offset + center_idx]
+            if candidate < best_value:
+                best_value = candidate
+                best_center = center_idx
+        labels[sample_idx] = best_center
+
+        if not update_centers:
+            continue
+
+        # M-step contribution of this sample to its cluster.
+        weight_in_clusters[best_center] += sample_weight[sample_idx]
+        for feature_idx in range(n_features):
+            centers_new[best_center * n_features + feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx]
+
+
+def lloyd_iter_chunked_sparse(
+        X,                                   # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # OUT
+        floating[::1] weight_in_clusters,    # OUT
+        int[::1] labels,                     # OUT
+        floating[::1] center_shift,          # OUT
+        int n_threads,
+        bint update_centers=True):
+    """Single iteration of K-means lloyd algorithm with sparse input.
+
+    Update labels and centers (inplace), for one iteration, distributed
+    over data chunks.
+
+    Parameters
+    ----------
+    X : sparse matrix of shape (n_samples, n_features), dtype=floating
+        The observations to cluster. Must be in CSR format.
+
+    sample_weight : ndarray of shape (n_samples,), dtype=floating
+        The weights for each observation in X.
+
+    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers before previous iteration, placeholder for the centers after
+        previous iteration.
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration. `centers_new` can be `None` if
+        `update_centers` is False.
+
+    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
+        Placeholder for the sums of the weights of every observation assigned
+        to each center. `weight_in_clusters` can be `None` if `update_centers`
+        is False.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    center_shift : ndarray of shape (n_clusters,), dtype=floating
+        Distance between old and new centers.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+
+    update_centers : bool
+        - If True, the labels and the new centers will be computed, i.e. runs
+          the E-step and the M-step of the algorithm.
+        - If False, only the labels will be computed, i.e runs the E-step of
+          the algorithm. This is useful especially when calling predict on a
+          fitted model.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_features = X.shape[1]
+        int n_clusters = centers_old.shape[0]
+
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
+        # Choose same as for dense. Does not have the same impact since with
+        # sparse data the pairwise distances matrix is not precomputed.
+        # However, splitting in chunks is necessary to get parallelism.
+        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
+        int n_chunks = n_samples // n_samples_chunk
+        int n_samples_rem = n_samples % n_samples_chunk
+        int chunk_idx
+        int start = 0, end = 0
+
+        int j, k
+
+        # Raw CSR components of X.
+        floating[::1] X_data = X.data
+        int[::1] X_indices = X.indices
+        int[::1] X_indptr = X.indptr
+
+        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
+
+        # Buffers allocated inside the parallel block below.
+        floating *centers_new_chunk
+        floating *weight_in_clusters_chunk
+
+        # Serialises the reduction of the per-thread buffers.
+        omp_lock_t lock
+
+    # count remainder chunk in total number of chunks
+    n_chunks += n_samples != n_chunks * n_samples_chunk
+
+    # number of threads should not be bigger than number of chunks
+    n_threads = min(n_threads, n_chunks)
+
+    if update_centers:
+        # Reset the shared accumulators: the per-thread partial sums are added
+        # into them during the reduction below.
+        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
+        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
+        omp_init_lock(&lock)
+
+    with nogil, parallel(num_threads=n_threads):
+        # thread local buffers
+        # Both accumulators must start at zero, hence calloc.
+        # NOTE(review): the allocation results are not checked for NULL.
+        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
+        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
+
+        for chunk_idx in prange(n_chunks, schedule='static'):
+            start = chunk_idx * n_samples_chunk
+            # Only the last chunk can be shorter than n_samples_chunk.
+            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
+                end = start + n_samples_rem
+            else:
+                end = start + n_samples_chunk
+
+            # X_data / X_indices are sliced to the non-zeros of the chunk,
+            # while the X_indptr slice keeps offsets relative to the full
+            # arrays; the callee rebases them with X_indptr[0].
+            _update_chunk_sparse(
+                X_data[X_indptr[start]: X_indptr[end]],
+                X_indices[X_indptr[start]: X_indptr[end]],
+                X_indptr[start: end+1],
+                sample_weight[start: end],
+                centers_old,
+                centers_squared_norms,
+                labels[start: end],
+                centers_new_chunk,
+                weight_in_clusters_chunk,
+                update_centers)
+
+        # reduction from local buffers.
+        if update_centers:
+            # The lock is necessary to avoid race conditions when aggregating
+            # info from different thread-local buffers.
+            omp_set_lock(&lock)
+            for j in range(n_clusters):
+                weight_in_clusters[j] += weight_in_clusters_chunk[j]
+                for k in range(n_features):
+                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
+            omp_unset_lock(&lock)
+
+        free(centers_new_chunk)
+        free(weight_in_clusters_chunk)
+
+    if update_centers:
+        omp_destroy_lock(&lock)
+        # At this point centers_new holds weighted sums, not means. The
+        # helpers below are defined in _k_means_common; going by their names
+        # they handle clusters that received no sample, turn the sums into
+        # averages and fill center_shift - confirm against their definitions.
+        _relocate_empty_clusters_sparse(
+            X_data, X_indices, X_indptr, sample_weight,
+            centers_old, centers_new, weight_in_clusters, labels)
+
+        _average_centers(centers_new, weight_in_clusters)
+        _center_shift(centers_old, centers_new, center_shift)
+
+
+cdef void _update_chunk_sparse(
+        const floating[::1] X_data,                 # IN
+        const int[::1] X_indices,                   # IN
+        const int[::1] X_indptr,                    # IN
+        const floating[::1] sample_weight,          # IN
+        const floating[:, ::1] centers_old,         # IN
+        const floating[::1] centers_squared_norms,  # IN
+        int[::1] labels,                            # OUT
+        floating *centers_new,                      # OUT
+        floating *weight_in_clusters,               # OUT
+        bint update_centers) noexcept nogil:
+    """Run the combined E and M steps of K-means on one sparse (CSR) chunk.
+
+    Every sample of the chunk is assigned to its closest center (written to
+    `labels`). When `update_centers` is true, the weighted sample is also
+    added to the flat accumulators `centers_new` (indexed as
+    `label * n_features + feature`) and `weight_in_clusters`.
+
+    `X_indptr` is expected to still hold offsets into the full CSR arrays;
+    they are rebased with `X_indptr[0]` before indexing `X_data` and
+    `X_indices`.
+    """
+    cdef:
+        int n_samples = labels.shape[0]
+        int n_clusters = centers_old.shape[0]
+        int n_features = centers_old.shape[1]
+
+        # Position of the chunk's first non-zero in the full CSR arrays.
+        int offset = X_indptr[0]
+        floating largest = FLT_MAX if floating is float else DBL_MAX
+
+        floating dot, score, best_score
+        int row, center, nz, nz_begin, nz_end, best_center
+
+    # XXX Precomputing the pairwise distances matrix is currently not worth it
+    # for sparse data. Should be tested when BLAS (sparse x dense) matrix
+    # multiplication is available.
+    for row in range(n_samples):
+        # Non-zero entries of this sample, relative to the chunk slices.
+        nz_begin = X_indptr[row] - offset
+        nz_end = X_indptr[row + 1] - offset
+
+        best_score = largest
+        best_center = 0
+        for center in range(n_clusters):
+            dot = 0.0
+            for nz in range(nz_begin, nz_end):
+                dot += centers_old[center, X_indices[nz]] * X_data[nz]
+
+            # The argmin over the centers of
+            # ||x - c||² = ||x||² - 2 x.c + ||c||² does not depend on ||x||²,
+            # so only the "- 2 x.c + ||c||²" part is evaluated.
+            score = centers_squared_norms[center] - 2 * dot
+            if score < best_score:
+                best_score = score
+                best_center = center
+
+        labels[row] = best_center
+
+        if not update_centers:
+            continue
+
+        # M-step contribution of this sample to its cluster.
+        weight_in_clusters[best_center] += sample_weight[row]
+        for nz in range(nz_begin, nz_end):
+            centers_new[best_center * n_features + X_indices[nz]] += X_data[nz] * sample_weight[row]
@@ -0,0 +1,218 @@
+from cython cimport floating
+from cython.parallel cimport parallel, prange
+from libc.stdlib cimport malloc, free
+
+
+def _minibatch_update_dense(
+        const floating[:, ::1] X,            # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # OUT
+        floating[::1] weight_sums,           # INOUT
+        const int[::1] labels,               # IN
+        int n_threads):
+    """Update of the centers for dense MiniBatchKMeans.
+
+    Each cluster is processed by one call to `update_center_dense`; the
+    clusters are distributed over the OpenMP threads.
+
+    Parameters
+    ----------
+    X : ndarray of shape (n_samples, n_features), dtype=floating
+        The observations to cluster.
+
+    sample_weight : ndarray of shape (n_samples,), dtype=floating
+        The weights for each observation in X.
+
+    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers before previous iteration, placeholder for the centers after
+        previous iteration.
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration.
+
+    weight_sums : ndarray of shape (n_clusters,), dtype=floating
+        Current sums of the accumulated weights for each center.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+    """
+    cdef:
+        int n_samples = X.shape[0]
+        int n_clusters = centers_old.shape[0]
+        int cluster_idx
+
+        # Scratch buffer handed to update_center_dense.
+        int *indices
+
+    with nogil, parallel(num_threads=n_threads):
+        # Allocated inside the parallel block and sized for the worst case
+        # where every sample belongs to the same cluster.
+        # NOTE(review): the allocation result is not checked for NULL.
+        indices = <int*> malloc(n_samples * sizeof(int))
+
+        # No lock is taken here: each call only writes row `cluster_idx` of
+        # centers_new and entry `cluster_idx` of weight_sums.
+        for cluster_idx in prange(n_clusters, schedule="static"):
+            update_center_dense(cluster_idx, X, sample_weight,
+                                centers_old, centers_new, weight_sums, labels,
+                                indices)
+
+        free(indices)
+
+
+cdef void update_center_dense(
+        int cluster_idx,
+        const floating[:, ::1] X,            # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # OUT
+        floating[::1] weight_sums,           # INOUT
+        const int[::1] labels,               # IN
+        int *indices) noexcept nogil:        # TMP
+    """Recompute one center of dense MiniBatchKMeans from the current batch.
+
+    The samples labelled `cluster_idx` are folded into the running weighted
+    mean of that cluster: row `cluster_idx` of `centers_new` receives the
+    updated center and `weight_sums[cluster_idx]` is increased by the batch
+    weight. When the batch weight of the cluster is not positive, the old
+    center is copied unchanged and `weight_sums` is left untouched.
+
+    `indices` is scratch space that must have room for `n_samples` ints.
+    """
+    cdef:
+        int n_samples = sample_weight.shape[0]
+        int n_features = centers_old.shape[1]
+        int n_members = 0
+        int pos, row, col
+        floating scale
+
+        floating batch_weight = 0
+
+    # Equivalent of indices = np.where(labels == cluster_idx)[0], with the
+    # weights of the selected samples summed on the fly.
+    for row in range(n_samples):
+        if labels[row] != cluster_idx:
+            continue
+        indices[n_members] = row
+        batch_weight += sample_weight[row]
+        n_members += 1
+
+    if not batch_weight > 0:
+        # No sample was assigned to this cluster in this batch of data:
+        # carry the previous center over as is.
+        for col in range(n_features):
+            centers_new[cluster_idx, col] = centers_old[cluster_idx, col]
+        return
+
+    # Turn the stored mean back into a weighted sum by undoing the previous
+    # scaling of this cluster center.
+    for col in range(n_features):
+        centers_new[cluster_idx, col] = centers_old[cluster_idx, col] * weight_sums[cluster_idx]
+
+    # Add the weighted members found in this batch.
+    for pos in range(n_members):
+        row = indices[pos]
+        for col in range(n_features):
+            centers_new[cluster_idx, col] += X[row, col] * sample_weight[row]
+
+    # The cluster has now seen `batch_weight` more weight in total.
+    weight_sums[cluster_idx] += batch_weight
+
+    # Rescale the sum into the mean of all points (old and new).
+    scale = 1 / weight_sums[cluster_idx]
+    for col in range(n_features):
+        centers_new[cluster_idx, col] *= scale
+
+
+def _minibatch_update_sparse(
+        X,                                   # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # OUT
+        floating[::1] weight_sums,           # INOUT
+        const int[::1] labels,               # IN
+        int n_threads):
+    """Update of the centers for sparse MiniBatchKMeans.
+
+    Each cluster is processed by one call to `update_center_sparse`; the
+    clusters are distributed over the OpenMP threads.
+
+    Parameters
+    ----------
+    X : sparse matrix of shape (n_samples, n_features), dtype=floating
+        The observations to cluster. Must be in CSR format.
+
+    sample_weight : ndarray of shape (n_samples,), dtype=floating
+        The weights for each observation in X.
+
+    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers before previous iteration, placeholder for the centers after
+        previous iteration.
+
+    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
+        Centers after previous iteration, placeholder for the new centers
+        computed during this iteration.
+
+    weight_sums : ndarray of shape (n_clusters,), dtype=floating
+        Current sums of the accumulated weights for each center.
+
+    labels : ndarray of shape (n_samples,), dtype=int
+        labels assignment.
+
+    n_threads : int
+        The number of threads to be used by openmp.
+    """
+    cdef:
+        # Raw CSR components of X.
+        floating[::1] X_data = X.data
+        int[::1] X_indices = X.indices
+        int[::1] X_indptr = X.indptr
+        int n_samples = X.shape[0]
+        int n_clusters = centers_old.shape[0]
+        int cluster_idx
+
+        # Scratch buffer handed to update_center_sparse.
+        int *indices
+
+    with nogil, parallel(num_threads=n_threads):
+        # Allocated inside the parallel block and sized for the worst case
+        # where every sample belongs to the same cluster.
+        # NOTE(review): the allocation result is not checked for NULL.
+        indices = <int*> malloc(n_samples * sizeof(int))
+
+        # No lock is taken here: each call only writes row `cluster_idx` of
+        # centers_new and entry `cluster_idx` of weight_sums.
+        for cluster_idx in prange(n_clusters, schedule="static"):
+            update_center_sparse(cluster_idx, X_data, X_indices, X_indptr,
+                                 sample_weight, centers_old, centers_new,
+                                 weight_sums, labels, indices)
+
+        free(indices)
+
+
+cdef void update_center_sparse(
+        int cluster_idx,
+        const floating[::1] X_data,          # IN
+        const int[::1] X_indices,            # IN
+        const int[::1] X_indptr,             # IN
+        const floating[::1] sample_weight,   # IN
+        const floating[:, ::1] centers_old,  # IN
+        floating[:, ::1] centers_new,        # OUT
+        floating[::1] weight_sums,           # INOUT
+        const int[::1] labels,               # IN
+        int *indices) noexcept nogil:        # TMP
+    """Recompute one center of sparse MiniBatchKMeans from the current batch.
+
+    The samples labelled `cluster_idx` are folded into the running weighted
+    mean of that cluster: row `cluster_idx` of `centers_new` receives the
+    updated center and `weight_sums[cluster_idx]` is increased by the batch
+    weight. When the batch weight of the cluster is not positive, the old
+    center is copied unchanged and `weight_sums` is left untouched.
+
+    The samples are read from the CSR arrays `X_data`, `X_indices` and
+    `X_indptr`. `indices` is scratch space that must have room for
+    `n_samples` ints.
+    """
+    cdef:
+        int n_samples = sample_weight.shape[0]
+        int n_features = centers_old.shape[1]
+        int n_members = 0
+        int pos, row, col, nz
+        floating scale
+
+        floating batch_weight = 0
+
+    # Equivalent of indices = np.where(labels == cluster_idx)[0], with the
+    # weights of the selected samples summed on the fly.
+    for row in range(n_samples):
+        if labels[row] != cluster_idx:
+            continue
+        indices[n_members] = row
+        batch_weight += sample_weight[row]
+        n_members += 1
+
+    if not batch_weight > 0:
+        # No sample was assigned to this cluster in this batch of data:
+        # carry the previous center over as is.
+        for col in range(n_features):
+            centers_new[cluster_idx, col] = centers_old[cluster_idx, col]
+        return
+
+    # Turn the stored mean back into a weighted sum by undoing the previous
+    # scaling of this cluster center.
+    for col in range(n_features):
+        centers_new[cluster_idx, col] = centers_old[cluster_idx, col] * weight_sums[cluster_idx]
+
+    # Add the weighted members found in this batch; only the stored
+    # (non-zero) entries of each CSR row are visited.
+    for pos in range(n_members):
+        row = indices[pos]
+        for nz in range(X_indptr[row], X_indptr[row + 1]):
+            centers_new[cluster_idx, X_indices[nz]] += X_data[nz] * sample_weight[row]
+
+    # The cluster has now seen `batch_weight` more weight in total.
+    weight_sums[cluster_idx] += batch_weight
+
+    # Rescale the sum into the mean of all points (old and new).
+    scale = 1 / weight_sums[cluster_idx]
+    for col in range(n_features):
+        centers_new[cluster_idx, col] *= scale
@@ -0,0 +1,575 @@
+"""Mean shift clustering algorithm.
+
+Mean shift clustering aims to discover *blobs* in a smooth density of
+samples. It is a centroid based algorithm, which works by updating candidates
+for centroids to be the mean of the points within a given region. These
+candidates are then filtered in a post-processing stage to eliminate
+near-duplicates to form the final set of centroids.
+
+Seeding is performed using a binning technique for scalability.
+"""
+
+# Authors: Conrad Lee <conradlee@gmail.com>
+#          Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#          Gael Varoquaux <gael.varoquaux@normalesup.org>
+#          Martino Sorbaro <martino.sorbaro@ed.ac.uk>
+
+import warnings
+from collections import defaultdict
+from numbers import Integral, Real
+
+import numpy as np
+
+from .._config import config_context
+from ..base import BaseEstimator, ClusterMixin, _fit_context
+from ..metrics.pairwise import pairwise_distances_argmin
+from ..neighbors import NearestNeighbors
+from ..utils import check_array, check_random_state, gen_batches
+from ..utils._param_validation import Interval, validate_params
+from ..utils.parallel import Parallel, delayed
+from ..utils.validation import check_is_fitted
+
+
+@validate_params(
+    {
+        "X": ["array-like"],
+        "quantile": [Interval(Real, 0, 1, closed="both")],
+        "n_samples": [Interval(Integral, 1, None, closed="left"), None],
+        "random_state": ["random_state"],
+        "n_jobs": [Integral, None],
+    },
+    prefer_skip_nested_validation=True,
+)
+def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None):
+    """Estimate the bandwidth to use with the mean-shift algorithm.
+
+    This function takes time at least quadratic in `n_samples`. For large
+    datasets, it is wise to subsample by setting `n_samples`. Alternatively,
+    the parameter `bandwidth` can be set to a small value without estimating
+    it.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Input points.
+
+    quantile : float, default=0.3
+        Should be between [0, 1]
+        0.5 means that the median of all pairwise distances is used.
+
+    n_samples : int, default=None
+        The number of samples to use. If not given, all samples are used.
+
+    random_state : int, RandomState instance, default=None
+        The generator used to randomly select the samples from input points
+        for bandwidth estimation. Use an int to make the randomness
+        deterministic.
+        See :term:`Glossary <random_state>`.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    Returns
+    -------
+    bandwidth : float
+        The bandwidth parameter.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.cluster import estimate_bandwidth
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> estimate_bandwidth(X, quantile=0.5)
+    1.61...
+    """
+    X = check_array(X)
+
+    random_state = check_random_state(random_state)
+    if n_samples is not None:
+        idx = random_state.permutation(X.shape[0])[:n_samples]
+        X = X[idx]
+    n_neighbors = int(X.shape[0] * quantile)
+    if n_neighbors < 1:  # cannot fit NearestNeighbors with n_neighbors = 0
+        n_neighbors = 1
+    nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)
+    nbrs.fit(X)
+
+    bandwidth = 0.0
+    for batch in gen_batches(len(X), 500):
+        d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
+        bandwidth += np.max(d, axis=1).sum()
+
+    return bandwidth / X.shape[0]
+
+
+# separate function for each seed's iterative loop
+def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
+    # For each seed, climb gradient until convergence or max_iter
+    bandwidth = nbrs.get_params()["radius"]
+    stop_thresh = 1e-3 * bandwidth  # when mean has converged
+    completed_iterations = 0
+    while True:
+        # Find mean of points within bandwidth
+        i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0]
+        points_within = X[i_nbrs]
+        if len(points_within) == 0:
+            break  # Depending on seeding strategy this condition may occur
+        my_old_mean = my_mean  # save the old mean
+        my_mean = np.mean(points_within, axis=0)
+        # If converged or at max_iter, adds the cluster
+        if (
+            np.linalg.norm(my_mean - my_old_mean) <= stop_thresh
+            or completed_iterations == max_iter
+        ):
+            break
+        completed_iterations += 1
+    return tuple(my_mean), len(points_within), completed_iterations
+
+
+@validate_params(
+    {"X": ["array-like"]},
+    prefer_skip_nested_validation=False,
+)
+def mean_shift(
+    X,
+    *,
+    bandwidth=None,
+    seeds=None,
+    bin_seeding=False,
+    min_bin_freq=1,
+    cluster_all=True,
+    max_iter=300,
+    n_jobs=None,
+):
+    """Perform mean shift clustering of data using a flat kernel.
+
+    Read more in the :ref:`User Guide <mean_shift>`.
+
+    Parameters
+    ----------
+
+    X : array-like of shape (n_samples, n_features)
+        Input data.
+
+    bandwidth : float, default=None
+        Kernel bandwidth. If not None, must be in the range [0, +inf).
+
+        If None, the bandwidth is determined using a heuristic based on
+        the median of all pairwise distances. This will take quadratic time in
+        the number of samples. The sklearn.cluster.estimate_bandwidth function
+        can be used to do this more efficiently.
+
+    seeds : array-like of shape (n_seeds, n_features) or None
+        Point used as initial kernel locations. If None and bin_seeding=False,
+        each data point is used as a seed. If None and bin_seeding=True,
+        see bin_seeding.
+
+    bin_seeding : bool, default=False
+        If true, initial kernel locations are not locations of all
+        points, but rather the location of the discretized version of
+        points, where points are binned onto a grid whose coarseness
+        corresponds to the bandwidth. Setting this option to True will speed
+        up the algorithm because fewer seeds will be initialized.
+        Ignored if seeds argument is not None.
+
+    min_bin_freq : int, default=1
+       To speed up the algorithm, accept only those bins with at least
+       min_bin_freq points as seeds.
+
+    cluster_all : bool, default=True
+        If true, then all points are clustered, even those orphans that are
+        not within any kernel. Orphans are assigned to the nearest kernel.
+        If false, then orphans are given cluster label -1.
+
+    max_iter : int, default=300
+        Maximum number of iterations, per seed point before the clustering
+        operation terminates (for that seed point), if has not converged yet.
+
+    n_jobs : int, default=None
+        The number of jobs to use for the computation. The following tasks benefit
+        from the parallelization:
+
+        - The search of nearest neighbors for bandwidth estimation and label
+          assignments. See the details in the docstring of the
+          ``NearestNeighbors`` class.
+        - Hill-climbing optimization for all seeds.
+
+        See :term:`Glossary <n_jobs>` for more details.
+
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+        .. versionadded:: 0.17
+           Parallel Execution using *n_jobs*.
+
+    Returns
+    -------
+
+    cluster_centers : ndarray of shape (n_clusters, n_features)
+        Coordinates of cluster centers.
+
+    labels : ndarray of shape (n_samples,)
+        Cluster labels for each point.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/cluster/plot_mean_shift.py
+    <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.cluster import mean_shift
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> cluster_centers, labels = mean_shift(X, bandwidth=2)
+    >>> cluster_centers
+    array([[3.33..., 6.     ],
+           [1.33..., 0.66...]])
+    >>> labels
+    array([1, 1, 1, 0, 0, 0])
+    """
+    model = MeanShift(
+        bandwidth=bandwidth,
+        seeds=seeds,
+        min_bin_freq=min_bin_freq,
+        bin_seeding=bin_seeding,
+        cluster_all=cluster_all,
+        n_jobs=n_jobs,
+        max_iter=max_iter,
+    ).fit(X)
+    return model.cluster_centers_, model.labels_
+
+
+def get_bin_seeds(X, bin_size, min_bin_freq=1):
+    """Find seeds for mean_shift.
+
+    Finds seeds by first binning data onto a grid whose lines are
+    spaced bin_size apart, and then choosing those bins with at least
+    min_bin_freq points.
+
+    Parameters
+    ----------
+
+    X : array-like of shape (n_samples, n_features)
+        Input points, the same points that will be used in mean_shift.
+
+    bin_size : float
+        Controls the coarseness of the binning. Smaller values lead
+        to more seeding (which is computationally more expensive). If you're
+        not sure how to set this, set it to the value of the bandwidth used
+        in clustering.mean_shift.
+
+    min_bin_freq : int, default=1
+        Only bins with at least min_bin_freq will be selected as seeds.
+        Raising this value decreases the number of seeds found, which
+        makes mean_shift computationally cheaper.
+
+    Returns
+    -------
+    bin_seeds : array-like of shape (n_samples, n_features)
+        Points used as initial kernel positions in clustering.mean_shift.
+    """
+    if bin_size == 0:
+        return X
+
+    # Bin points
+    bin_sizes = defaultdict(int)
+    for point in X:
+        binned_point = np.round(point / bin_size)
+        bin_sizes[tuple(binned_point)] += 1
+
+    # Select only those bins as seeds which have enough members
+    bin_seeds = np.array(
+        [point for point, freq in bin_sizes.items() if freq >= min_bin_freq],
+        dtype=np.float32,
+    )
+    if len(bin_seeds) == len(X):
+        warnings.warn(
+            "Binning data failed with provided bin_size=%f, using data points as seeds."
+            % bin_size
+        )
+        return X
+    bin_seeds = bin_seeds * bin_size
+    return bin_seeds
+
+
+class MeanShift(ClusterMixin, BaseEstimator):
+    """Mean shift clustering using a flat kernel.
+
+    Mean shift clustering aims to discover "blobs" in a smooth density of
+    samples. It is a centroid-based algorithm, which works by updating
+    candidates for centroids to be the mean of the points within a given
+    region. These candidates are then filtered in a post-processing stage to
+    eliminate near-duplicates to form the final set of centroids.
+
+    Seeding is performed using a binning technique for scalability.
+
+    Read more in the :ref:`User Guide <mean_shift>`.
+
+    Parameters
+    ----------
+    bandwidth : float, default=None
+        Bandwidth used in the flat kernel.
+
+        If not given, the bandwidth is estimated using
+        sklearn.cluster.estimate_bandwidth; see the documentation for that
+        function for hints on scalability (see also the Notes, below).
+
+    seeds : array-like of shape (n_samples, n_features), default=None
+        Seeds used to initialize kernels. If not set,
+        the seeds are calculated by clustering.get_bin_seeds
+        with bandwidth as the grid size and default values for
+        other parameters.
+
+    bin_seeding : bool, default=False
+        If true, initial kernel locations are not locations of all
+        points, but rather the location of the discretized version of
+        points, where points are binned onto a grid whose coarseness
+        corresponds to the bandwidth. Setting this option to True will speed
+        up the algorithm because fewer seeds will be initialized.
+        The default value is False.
+        Ignored if seeds argument is not None.
+
+    min_bin_freq : int, default=1
+       To speed up the algorithm, accept only those bins with at least
+       min_bin_freq points as seeds.
+
+    cluster_all : bool, default=True
+        If true, then all points are clustered, even those orphans that are
+        not within any kernel. Orphans are assigned to the nearest kernel.
+        If false, then orphans are given cluster label -1.
+
+    n_jobs : int, default=None
+        The number of jobs to use for the computation. The following tasks benefit
+        from the parallelization:
+
+        - The search of nearest neighbors for bandwidth estimation and label
+          assignments. See the details in the docstring of the
+          ``NearestNeighbors`` class.
+        - Hill-climbing optimization for all seeds.
+
+        See :term:`Glossary <n_jobs>` for more details.
+
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    max_iter : int, default=300
+        Maximum number of iterations, per seed point before the clustering
+        operation terminates (for that seed point), if has not converged yet.
+
+        .. versionadded:: 0.22
+
+    Attributes
+    ----------
+    cluster_centers_ : ndarray of shape (n_clusters, n_features)
+        Coordinates of cluster centers.
+
+    labels_ : ndarray of shape (n_samples,)
+        Labels of each point.
+
+    n_iter_ : int
+        Maximum number of iterations performed on each seed.
+
+        .. versionadded:: 0.22
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    KMeans : K-Means clustering.
+
+    Notes
+    -----
+
+    Scalability:
+
+    Because this implementation uses a flat kernel and
+    a Ball Tree to look up members of each kernel, the complexity will tend
+    towards O(T*n*log(n)) in lower dimensions, with n the number of samples
+    and T the number of points. In higher dimensions the complexity will
+    tend towards O(T*n^2).
+
+    Scalability can be boosted by using fewer seeds, for example by using
+    a higher value of min_bin_freq in the get_bin_seeds function.
+
+    Note that the estimate_bandwidth function is much less scalable than the
+    mean shift algorithm and will be the bottleneck if it is used.
+
+    References
+    ----------
+
+    Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
+    feature space analysis". IEEE Transactions on Pattern Analysis and
+    Machine Intelligence. 2002. pp. 603-619.
+
+    Examples
+    --------
+    >>> from sklearn.cluster import MeanShift
+    >>> import numpy as np
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> clustering = MeanShift(bandwidth=2).fit(X)
+    >>> clustering.labels_
+    array([1, 1, 1, 0, 0, 0])
+    >>> clustering.predict([[0, 0], [5, 5]])
+    array([1, 0])
+    >>> clustering
+    MeanShift(bandwidth=2)
+    """
+
+    _parameter_constraints: dict = {
+        "bandwidth": [Interval(Real, 0, None, closed="neither"), None],
+        "seeds": ["array-like", None],
+        "bin_seeding": ["boolean"],
+        "min_bin_freq": [Interval(Integral, 1, None, closed="left")],
+        "cluster_all": ["boolean"],
+        "n_jobs": [Integral, None],
+        "max_iter": [Interval(Integral, 0, None, closed="left")],
+    }
+
+    def __init__(
+        self,
+        *,
+        bandwidth=None,
+        seeds=None,
+        bin_seeding=False,
+        min_bin_freq=1,
+        cluster_all=True,
+        n_jobs=None,
+        max_iter=300,
+    ):
+        self.bandwidth = bandwidth
+        self.seeds = seeds
+        self.bin_seeding = bin_seeding
+        self.cluster_all = cluster_all
+        self.min_bin_freq = min_bin_freq
+        self.n_jobs = n_jobs
+        self.max_iter = max_iter
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Perform clustering.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples to cluster.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+               Fitted instance.
+        """
+        X = self._validate_data(X)
+        bandwidth = self.bandwidth
+        if bandwidth is None:
+            bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)
+
+        seeds = self.seeds
+        if seeds is None:
+            if self.bin_seeding:
+                seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
+            else:
+                seeds = X
+        n_samples, n_features = X.shape
+        center_intensity_dict = {}
+
+        # We use n_jobs=1 because this will be used in nested calls under
+        # parallel calls to _mean_shift_single_seed so there is no need for
+        # for further parallelism.
+        nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
+
+        # execute iterations on all seeds in parallel
+        all_res = Parallel(n_jobs=self.n_jobs)(
+            delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter)
+            for seed in seeds
+        )
+        # copy results in a dictionary
+        for i in range(len(seeds)):
+            if all_res[i][1]:  # i.e. len(points_within) > 0
+                center_intensity_dict[all_res[i][0]] = all_res[i][1]
+
+        self.n_iter_ = max([x[2] for x in all_res])
+
+        if not center_intensity_dict:
+            # nothing near seeds
+            raise ValueError(
+                "No point was within bandwidth=%f of any seed. Try a different seeding"
+                " strategy                              or increase the bandwidth."
+                % bandwidth
+            )
+
+        # POST PROCESSING: remove near duplicate points
+        # If the distance between two kernels is less than the bandwidth,
+        # then we have to remove one because it is a duplicate. Remove the
+        # one with fewer points.
+
+        sorted_by_intensity = sorted(
+            center_intensity_dict.items(),
+            key=lambda tup: (tup[1], tup[0]),
+            reverse=True,
+        )
+        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
+        unique = np.ones(len(sorted_centers), dtype=bool)
+        nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit(
+            sorted_centers
+        )
+        for i, center in enumerate(sorted_centers):
+            if unique[i]:
+                neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[
+                    0
+                ]
+                unique[neighbor_idxs] = 0
+                unique[i] = 1  # leave the current point as unique
+        cluster_centers = sorted_centers[unique]
+
+        # ASSIGN LABELS: a point belongs to the cluster that it is closest to
+        nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers)
+        labels = np.zeros(n_samples, dtype=int)
+        distances, idxs = nbrs.kneighbors(X)
+        if self.cluster_all:
+            labels = idxs.flatten()
+        else:
+            labels.fill(-1)
+            bool_selector = distances.flatten() <= bandwidth
+            labels[bool_selector] = idxs.flatten()[bool_selector]
+
+        self.cluster_centers_, self.labels_ = cluster_centers, labels
+        return self
+
+    def predict(self, X):
+        """Predict the closest cluster each sample in X belongs to.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            New data to predict.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Index of the cluster each sample belongs to.
+        """
+        check_is_fitted(self)
+        X = self._validate_data(X, reset=False)
+        with config_context(assume_finite=True):
+            return pairwise_distances_argmin(X, self.cluster_centers_)
@@ -0,0 +1,801 @@
+"""Algorithms for spectral clustering"""
+
+# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
+#         Brian Cheung
+#         Wei LI <kuantkid@gmail.com>
+#         Andrew Knyazev <Andrew.Knyazev@ucdenver.edu>
+# License: BSD 3 clause
+
+import warnings
+from numbers import Integral, Real
+
+import numpy as np
+from scipy.linalg import LinAlgError, qr, svd
+from scipy.sparse import csc_matrix
+
+from ..base import BaseEstimator, ClusterMixin, _fit_context
+from ..manifold._spectral_embedding import _spectral_embedding
+from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels
+from ..neighbors import NearestNeighbors, kneighbors_graph
+from ..utils import as_float_array, check_random_state
+from ..utils._param_validation import Interval, StrOptions, validate_params
+from ._kmeans import k_means
+
+
+def cluster_qr(vectors):
+    """Find the discrete partition closest to the eigenvector embedding.
+
+        This implementation was proposed in [1]_.
+
+    .. versionadded:: 1.1
+
+        Parameters
+        ----------
+        vectors : array-like, shape: (n_samples, n_clusters)
+            The embedding space of the samples.
+
+        Returns
+        -------
+        labels : array of integers, shape: n_samples
+            The cluster labels of vectors.
+
+        References
+        ----------
+        .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
+            Anil Damle, Victor Minden, Lexing Ying
+            <10.1093/imaiai/iay008>`
+
+    """
+
+    k = vectors.shape[1]
+    _, _, piv = qr(vectors.T, pivoting=True)
+    ut, _, v = svd(vectors[piv[:k], :].T)
+    vectors = abs(np.dot(vectors, np.dot(ut, v.conj())))
+    return vectors.argmax(axis=1)
+
+
+def discretize(
+    vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
+):
+    """Search for a partition matrix which is closest to the eigenvector embedding.
+
+    This implementation was proposed in [1]_.
+
+    Parameters
+    ----------
+    vectors : array-like of shape (n_samples, n_clusters)
+        The embedding space of the samples.
+
+    copy : bool, default=True
+        Whether to copy vectors, or perform in-place normalization.
+
+    max_svd_restarts : int, default=30
+        Maximum number of attempts to restart SVD if convergence fails
+
+    n_iter_max : int, default=30
+        Maximum number of iterations to attempt in rotation and partition
+        matrix search if machine precision convergence is not reached
+
+    random_state : int, RandomState instance, default=None
+        Determines random number generation for rotation matrix initialization.
+        Use an int to make the randomness deterministic.
+        See :term:`Glossary <random_state>`.
+
+    Returns
+    -------
+    labels : array of integers, shape: n_samples
+        The labels of the clusters.
+
+    References
+    ----------
+
+    .. [1] `Multiclass spectral clustering, 2003
+           Stella X. Yu, Jianbo Shi
+           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
+
+    Notes
+    -----
+
+    The eigenvector embedding is used to iteratively search for the
+    closest discrete partition.  First, the eigenvector embedding is
+    normalized to the space of partition matrices. An optimal discrete
+    partition matrix closest to this normalized embedding multiplied by
+    an initial rotation is calculated.  Fixing this discrete partition
+    matrix, an optimal rotation matrix is calculated.  These two
+    calculations are performed until convergence.  The discrete partition
+    matrix is returned as the clustering solution.  Used in spectral
+    clustering, this method tends to be faster and more robust to random
+    initialization than k-means.
+
+    """
+
+    random_state = check_random_state(random_state)
+
+    vectors = as_float_array(vectors, copy=copy)
+
+    eps = np.finfo(float).eps
+    n_samples, n_components = vectors.shape
+
+    # Normalize the eigenvectors to an equal length of a vector of ones.
+    # Reorient the eigenvectors to point in the negative direction with respect
+    # to the first element.  This may have to do with constraining the
+    # eigenvectors to lie in a specific quadrant to make the discretization
+    # search easier.
+    norm_ones = np.sqrt(n_samples)
+    for i in range(vectors.shape[1]):
+        vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones
+        if vectors[0, i] != 0:
+            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])
+
+    # Normalize the rows of the eigenvectors.  Samples should lie on the unit
+    # hypersphere centered at the origin.  This transforms the samples in the
+    # embedding space to the space of partition matrices.
+    vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]
+
+    svd_restarts = 0
+    has_converged = False
+
+    # If there is an exception we try to randomize and rerun SVD again
+    # do this max_svd_restarts times.
+    while (svd_restarts < max_svd_restarts) and not has_converged:
+        # Initialize first column of rotation matrix with a row of the
+        # eigenvectors
+        rotation = np.zeros((n_components, n_components))
+        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T
+
+        # To initialize the rest of the rotation matrix, find the rows
+        # of the eigenvectors that are as orthogonal to each other as
+        # possible
+        c = np.zeros(n_samples)
+        for j in range(1, n_components):
+            # Accumulate c to ensure row is as orthogonal as possible to
+            # previous picks as well as current one
+            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
+            rotation[:, j] = vectors[c.argmin(), :].T
+
+        last_objective_value = 0.0
+        n_iter = 0
+
+        while not has_converged:
+            n_iter += 1
+
+            t_discrete = np.dot(vectors, rotation)
+
+            labels = t_discrete.argmax(axis=1)
+            vectors_discrete = csc_matrix(
+                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
+                shape=(n_samples, n_components),
+            )
+
+            t_svd = vectors_discrete.T * vectors
+
+            try:
+                U, S, Vh = np.linalg.svd(t_svd)
+            except LinAlgError:
+                svd_restarts += 1
+                print("SVD did not converge, randomizing and trying again")
+                break
+
+            ncut_value = 2.0 * (n_samples - S.sum())
+            if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max):
+                has_converged = True
+            else:
+                # otherwise calculate rotation and continue
+                last_objective_value = ncut_value
+                rotation = np.dot(Vh.T, U.T)
+
+    if not has_converged:
+        raise LinAlgError("SVD did not converge")
+    return labels
+
+
+@validate_params(
+    {"affinity": ["array-like", "sparse matrix"]},
+    prefer_skip_nested_validation=False,
+)
+def spectral_clustering(
+    affinity,
+    *,
+    n_clusters=8,
+    n_components=None,
+    eigen_solver=None,
+    random_state=None,
+    n_init=10,
+    eigen_tol="auto",
+    assign_labels="kmeans",
+    verbose=False,
+):
+    """Apply clustering to a projection of the normalized Laplacian.
+
+    In practice Spectral Clustering is very useful when the structure of
+    the individual clusters is highly non-convex or more generally when
+    a measure of the center and spread of the cluster is not a suitable
+    description of the complete cluster. For instance, when clusters are
+    nested circles on the 2D plane.
+
+    If affinity is the adjacency matrix of a graph, this method can be
+    used to find normalized graph cuts [1]_, [2]_.
+
+    Read more in the :ref:`User Guide <spectral_clustering>`.
+
+    Parameters
+    ----------
+    affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
+        The affinity matrix describing the relationship of the samples to
+        embed. **Must be symmetric**.
+
+        Possible examples:
+          - adjacency matrix of a graph,
+          - heat kernel of the pairwise distance matrix of the samples,
+          - symmetric k-nearest neighbours connectivity matrix of the samples.
+
+    n_clusters : int, default=None
+        Number of clusters to extract.
+
+    n_components : int, default=n_clusters
+        Number of eigenvectors to use for the spectral embedding.
+
+    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
+        The eigenvalue decomposition method. If None then ``'arpack'`` is used.
+        See [4]_ for more details regarding ``'lobpcg'``.
+        Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
+        Algebraic MultiGrid preconditioning and requires pyamg to be installed.
+        It can be faster on very large sparse problems [6]_ and [7]_.
+
+    random_state : int, RandomState instance, default=None
+        A pseudo random number generator used for the initialization
+        of the lobpcg eigenvectors decomposition when `eigen_solver ==
+        'amg'`, and for the K-Means initialization. Use an int to make
+        the results deterministic across calls (See
+        :term:`Glossary <random_state>`).
+
+        .. note::
+            When using `eigen_solver == 'amg'`,
+            it is necessary to also fix the global numpy seed with
+            `np.random.seed(int)` to get deterministic results. See
+            https://github.com/pyamg/pyamg/issues/139 for further
+            information.
+
+    n_init : int, default=10
+        Number of time the k-means algorithm will be run with different
+        centroid seeds. The final results will be the best output of n_init
+        consecutive runs in terms of inertia. Only used if
+        ``assign_labels='kmeans'``.
+
+    eigen_tol : float, default="auto"
+        Stopping criterion for eigendecomposition of the Laplacian matrix.
+        If `eigen_tol="auto"` then the passed tolerance will depend on the
+        `eigen_solver`:
+
+        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
+        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
+          `eigen_tol=None` which configures the underlying `lobpcg` solver to
+          automatically resolve the value according to their heuristics. See,
+          :func:`scipy.sparse.linalg.lobpcg` for details.
+
+        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
+        values of `tol<1e-5` may lead to convergence issues and should be
+        avoided.
+
+        .. versionadded:: 1.2
+           Added 'auto' option.
+
+    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
+        The strategy to use to assign labels in the embedding
+        space.  There are three ways to assign labels after the Laplacian
+        embedding.  k-means can be applied and is a popular choice. But it can
+        also be sensitive to initialization. Discretization is another
+        approach which is less sensitive to random initialization [3]_.
+        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
+        in spectral clustering. In contrast to k-means and discretization, cluster_qr
+        has no tuning parameters and is not an iterative method, yet may outperform
+        k-means and discretization in terms of both quality and speed.
+
+        .. versionchanged:: 1.1
+           Added new labeling method 'cluster_qr'.
+
+    verbose : bool, default=False
+        Verbosity mode.
+
+        .. versionadded:: 0.24
+
+    Returns
+    -------
+    labels : array of integers, shape: n_samples
+        The labels of the clusters.
+
+    Notes
+    -----
+    The graph should contain only one connected component, elsewhere
+    the results make little sense.
+
+    This algorithm solves the normalized cut for `k=2`: it is a
+    normalized spectral clustering.
+
+    References
+    ----------
+
+    .. [1] :doi:`Normalized cuts and image segmentation, 2000
+           Jianbo Shi, Jitendra Malik
+           <10.1109/34.868688>`
+
+    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
+           Ulrike von Luxburg
+           <10.1007/s11222-007-9033-z>`
+
+    .. [3] `Multiclass spectral clustering, 2003
+           Stella X. Yu, Jianbo Shi
+           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
+
+    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
+           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
+           A. V. Knyazev
+           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
+           <10.1137/S1064827500366124>`
+
+    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
+           Anil Damle, Victor Minden, Lexing Ying
+           <10.1093/imaiai/iay008>`
+
+    .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
+           for computing eigenvalues of graph Laplacians in image segmentation, 2006
+           Andrew Knyazev
+           <10.13140/RG.2.2.35280.02565>`
+
+    .. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
+           streaming graph challenge (Preliminary version at arXiv.)
+           David Zhuzhunashvili, Andrew Knyazev
+           <10.1109/HPEC.2017.8091045>`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics.pairwise import pairwise_kernels
+    >>> from sklearn.cluster import spectral_clustering
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> affinity = pairwise_kernels(X, metric='rbf')
+    >>> spectral_clustering(
+    ...     affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0
+    ... )
+    array([1, 1, 1, 0, 0, 0])
+    """
+
+    clusterer = SpectralClustering(
+        n_clusters=n_clusters,
+        n_components=n_components,
+        eigen_solver=eigen_solver,
+        random_state=random_state,
+        n_init=n_init,
+        affinity="precomputed",
+        eigen_tol=eigen_tol,
+        assign_labels=assign_labels,
+        verbose=verbose,
+    ).fit(affinity)
+
+    return clusterer.labels_
+
+
+class SpectralClustering(ClusterMixin, BaseEstimator):
+    """Apply clustering to a projection of the normalized Laplacian.
+
+    In practice Spectral Clustering is very useful when the structure of
+    the individual clusters is highly non-convex, or more generally when
+    a measure of the center and spread of the cluster is not a suitable
+    description of the complete cluster, such as when clusters are
+    nested circles on the 2D plane.
+
+    If the affinity matrix is the adjacency matrix of a graph, this method
+    can be used to find normalized graph cuts [1]_, [2]_.
+
+    When calling ``fit``, an affinity matrix is constructed using either
+    a kernel function such as the Gaussian (aka RBF) kernel with Euclidean
+    distance ``d(X, X)``::
+
+            np.exp(-gamma * d(X,X) ** 2)
+
+    or a k-nearest neighbors connectivity matrix.
+
+    Alternatively, a user-provided affinity matrix can be specified by
+    setting ``affinity='precomputed'``.
+
+    Read more in the :ref:`User Guide <spectral_clustering>`.
+
+    Parameters
+    ----------
+    n_clusters : int, default=8
+        The dimension of the projection subspace when `n_components` is
+        None, and the number of clusters requested from k-means when
+        ``assign_labels='kmeans'``.
+
+    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
+        The eigenvalue decomposition strategy to use. AMG requires pyamg
+        to be installed. It can be faster on very large, sparse problems,
+        but may also lead to instabilities. If None, then ``'arpack'`` is
+        used. See [4]_ for more details regarding `'lobpcg'`.
+
+    n_components : int, default=None
+        Number of eigenvectors to use for the spectral embedding. If None,
+        defaults to `n_clusters`.
+
+    random_state : int, RandomState instance, default=None
+        A pseudo random number generator used for the initialization
+        of the lobpcg eigenvectors decomposition when `eigen_solver ==
+        'amg'`, and for the K-Means initialization. Use an int to make
+        the results deterministic across calls (See
+        :term:`Glossary <random_state>`).
+
+        .. note::
+            When using `eigen_solver == 'amg'`,
+            it is necessary to also fix the global numpy seed with
+            `np.random.seed(int)` to get deterministic results. See
+            https://github.com/pyamg/pyamg/issues/139 for further
+            information.
+
+    n_init : int, default=10
+        Number of times the k-means algorithm will be run with different
+        centroid seeds. The final results will be the best output of n_init
+        consecutive runs in terms of inertia. Only used if
+        ``assign_labels='kmeans'``.
+
+    gamma : float, default=1.0
+        Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
+        Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
+        or ``affinity='precomputed_nearest_neighbors'``.
+
+    affinity : str or callable, default='rbf'
+        How to construct the affinity matrix.
+         - 'nearest_neighbors': construct the affinity matrix by computing a
+           graph of nearest neighbors.
+         - 'rbf': construct the affinity matrix using a radial basis function
+           (RBF) kernel.
+         - 'precomputed': interpret ``X`` as a precomputed affinity matrix,
+           where larger values indicate greater similarity between instances.
+         - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
+           of precomputed distances, and construct a binary affinity matrix
+           from the ``n_neighbors`` nearest neighbors of each instance.
+         - one of the kernels supported by
+           :func:`~sklearn.metrics.pairwise.pairwise_kernels`.
+
+        Only kernels that produce similarity scores (non-negative values that
+        increase with similarity) should be used. This property is not checked
+        by the clustering algorithm.
+
+    n_neighbors : int, default=10
+        Number of neighbors to use when constructing the affinity matrix using
+        the nearest neighbors method. Ignored for ``affinity='rbf'``.
+
+    eigen_tol : float, default="auto"
+        Stopping criterion for eigen decomposition of the Laplacian matrix.
+        If `eigen_tol="auto"` then the passed tolerance will depend on the
+        `eigen_solver`:
+
+        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
+        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
+          `eigen_tol=None` which configures the underlying `lobpcg` solver to
+          automatically resolve the value according to their heuristics. See,
+          :func:`scipy.sparse.linalg.lobpcg` for details.
+
+        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
+        values of `tol<1e-5` may lead to convergence issues and should be
+        avoided.
+
+        .. versionadded:: 1.2
+           Added 'auto' option.
+
+    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
+        The strategy for assigning labels in the embedding space. There are
+        three ways to assign labels after the Laplacian embedding. k-means is
+        a popular choice, but it can be sensitive to initialization.
+        Discretization is another approach which is less sensitive to random
+        initialization [3]_.
+        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
+        in spectral clustering. In contrast to k-means and discretization, cluster_qr
+        has no tuning parameters and runs no iterations, yet may outperform
+        k-means and discretization in terms of both quality and speed.
+
+        .. versionchanged:: 1.1
+           Added new labeling method 'cluster_qr'.
+
+    degree : float, default=3
+        Degree of the polynomial kernel. Ignored by other kernels.
+
+    coef0 : float, default=1
+        Zero coefficient for polynomial and sigmoid kernels.
+        Ignored by other kernels.
+
+    kernel_params : dict of str to any, default=None
+        Parameters (keyword arguments) and values for kernel passed as
+        callable object. Ignored by other kernels. The dict is copied by
+        ``fit`` and is never modified.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run when `affinity='nearest_neighbors'`
+        or `affinity='precomputed_nearest_neighbors'`. The neighbors search
+        will be done in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : bool, default=False
+        Verbosity mode.
+
+        .. versionadded:: 0.24
+
+    Attributes
+    ----------
+    affinity_matrix_ : array-like of shape (n_samples, n_samples)
+        Affinity matrix used for clustering. Available only after calling
+        ``fit``.
+
+    labels_ : ndarray of shape (n_samples,)
+        Labels of each point
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    sklearn.cluster.KMeans : K-Means clustering.
+    sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
+        Applications with Noise.
+
+    Notes
+    -----
+    A distance matrix for which 0 indicates identical elements and high values
+    indicate very dissimilar elements can be transformed into an affinity /
+    similarity matrix that is well-suited for the algorithm by
+    applying the Gaussian (aka RBF, heat) kernel::
+
+        np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
+
+    where ``delta`` is a free parameter representing the width of the Gaussian
+    kernel.
+
+    An alternative is to take a symmetric version of the k-nearest neighbors
+    connectivity matrix of the points.
+
+    If the pyamg package is installed, it is used: this greatly
+    speeds up computation.
+
+    References
+    ----------
+    .. [1] :doi:`Normalized cuts and image segmentation, 2000
+           Jianbo Shi, Jitendra Malik
+           <10.1109/34.868688>`
+
+    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
+           Ulrike von Luxburg
+           <10.1007/s11222-007-9033-z>`
+
+    .. [3] `Multiclass spectral clustering, 2003
+           Stella X. Yu, Jianbo Shi
+           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
+
+    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
+           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
+           A. V. Knyazev
+           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
+           <10.1137/S1064827500366124>`
+
+    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
+           Anil Damle, Victor Minden, Lexing Ying
+           <10.1093/imaiai/iay008>`
+
+    Examples
+    --------
+    >>> from sklearn.cluster import SpectralClustering
+    >>> import numpy as np
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> clustering = SpectralClustering(n_clusters=2,
+    ...         assign_labels='discretize',
+    ...         random_state=0).fit(X)
+    >>> clustering.labels_
+    array([1, 1, 1, 0, 0, 0])
+    >>> clustering
+    SpectralClustering(assign_labels='discretize', n_clusters=2,
+        random_state=0)
+    """
+
+    _parameter_constraints: dict = {
+        "n_clusters": [Interval(Integral, 1, None, closed="left")],
+        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
+        "n_components": [Interval(Integral, 1, None, closed="left"), None],
+        "random_state": ["random_state"],
+        "n_init": [Interval(Integral, 1, None, closed="left")],
+        "gamma": [Interval(Real, 0, None, closed="left")],
+        "affinity": [
+            callable,
+            StrOptions(
+                set(KERNEL_PARAMS)
+                | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
+            ),
+        ],
+        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
+        "eigen_tol": [
+            Interval(Real, 0.0, None, closed="left"),
+            StrOptions({"auto"}),
+        ],
+        "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
+        "degree": [Interval(Real, 0, None, closed="left")],
+        "coef0": [Interval(Real, None, None, closed="neither")],
+        "kernel_params": [dict, None],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+    }
+
+    def __init__(
+        self,
+        n_clusters=8,
+        *,
+        eigen_solver=None,
+        n_components=None,
+        random_state=None,
+        n_init=10,
+        gamma=1.0,
+        affinity="rbf",
+        n_neighbors=10,
+        eigen_tol="auto",
+        assign_labels="kmeans",
+        degree=3,
+        coef0=1,
+        kernel_params=None,
+        n_jobs=None,
+        verbose=False,
+    ):
+        # Parameters are stored verbatim; nothing is validated or derived here.
+        self.n_clusters = n_clusters
+        self.eigen_solver = eigen_solver
+        self.n_components = n_components
+        self.random_state = random_state
+        self.n_init = n_init
+        self.gamma = gamma
+        self.affinity = affinity
+        self.n_neighbors = n_neighbors
+        self.eigen_tol = eigen_tol
+        self.assign_labels = assign_labels
+        self.degree = degree
+        self.coef0 = coef0
+        self.kernel_params = kernel_params
+        self.n_jobs = n_jobs
+        self.verbose = verbose
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Perform spectral clustering from features, or affinity matrix.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
+                (n_samples, n_samples)
+            Training instances to cluster, similarities / affinities between
+            instances if ``affinity='precomputed'``, or distances between
+            instances if ``affinity='precomputed_nearest_neighbors'``. If a
+            sparse matrix is provided in a format other than ``csr_matrix``,
+            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
+            sparse ``csr_matrix``.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            A fitted instance of the estimator.
+        """
+        X = self._validate_data(
+            X,
+            accept_sparse=["csr", "csc", "coo"],
+            dtype=np.float64,
+            ensure_min_samples=2,
+        )
+        allow_squared = self.affinity in [
+            "precomputed",
+            "precomputed_nearest_neighbors",
+        ]
+        if X.shape[0] == X.shape[1] and not allow_squared:
+            # A square input may be an affinity matrix passed without
+            # ``affinity='precomputed'``; warn instead of guessing.
+            warnings.warn(
+                "The spectral clustering API has changed. ``fit`` "
+                "now constructs an affinity matrix from data. To use"
+                " a custom affinity matrix, "
+                "set ``affinity=precomputed``."
+            )
+
+        if self.affinity == "nearest_neighbors":
+            connectivity = kneighbors_graph(
+                X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
+            )
+            # Average with the transpose to obtain a symmetric affinity.
+            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
+        elif self.affinity == "precomputed_nearest_neighbors":
+            estimator = NearestNeighbors(
+                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
+            ).fit(X)
+            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
+            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
+        elif self.affinity == "precomputed":
+            self.affinity_matrix_ = X
+        else:
+            # Copy the user-supplied dict: writing gamma/degree/coef0 into
+            # ``self.kernel_params`` itself would make ``fit`` silently alter
+            # a constructor parameter (and any dict the caller still holds).
+            params = {} if self.kernel_params is None else dict(self.kernel_params)
+            if not callable(self.affinity):
+                params["gamma"] = self.gamma
+                params["degree"] = self.degree
+                params["coef0"] = self.coef0
+            self.affinity_matrix_ = pairwise_kernels(
+                X, metric=self.affinity, filter_params=True, **params
+            )
+
+        random_state = check_random_state(self.random_state)
+        n_components = (
+            self.n_clusters if self.n_components is None else self.n_components
+        )
+        # We now obtain the real valued solution matrix to the
+        # relaxed Ncut problem, solving the eigenvalue problem
+        # L_sym x = lambda x  and recovering u = D^-1/2 x.
+        # The first eigenvector is constant only for fully connected graphs
+        # and should be kept for spectral clustering (drop_first = False)
+        # See spectral_embedding documentation.
+        maps = _spectral_embedding(
+            self.affinity_matrix_,
+            n_components=n_components,
+            eigen_solver=self.eigen_solver,
+            random_state=random_state,
+            eigen_tol=self.eigen_tol,
+            drop_first=False,
+        )
+        if self.verbose:
+            print(f"Computing label assignment using {self.assign_labels}")
+
+        # Parameter validation restricts ``assign_labels`` to three values,
+        # so the final branch can only be "discretize".
+        if self.assign_labels == "kmeans":
+            _, self.labels_, _ = k_means(
+                maps,
+                self.n_clusters,
+                random_state=random_state,
+                n_init=self.n_init,
+                verbose=self.verbose,
+            )
+        elif self.assign_labels == "cluster_qr":
+            self.labels_ = cluster_qr(maps)
+        else:
+            self.labels_ = discretize(maps, random_state=random_state)
+
+        return self
+
+    def fit_predict(self, X, y=None):
+        """Perform spectral clustering on `X` and return cluster labels.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
+                (n_samples, n_samples)
+            Training instances to cluster, similarities / affinities between
+            instances if ``affinity='precomputed'``, or distances between
+            instances if ``affinity='precomputed_nearest_neighbors'``. If a
+            sparse matrix is provided in a format other than ``csr_matrix``,
+            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
+            sparse ``csr_matrix``.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            Cluster labels.
+        """
+        return super().fit_predict(X, y)
+
+    def _more_tags(self):
+        # The estimator consumes a pairwise (n_samples, n_samples) input only
+        # for the two "precomputed" affinity modes.
+        return {
+            "pairwise": self.affinity
+            in [
+                "precomputed",
+                "precomputed_nearest_neighbors",
+            ]
+        }
@@ -0,0 +1,29 @@
+# Cython extension modules built under 'sklearn/cluster': maps each extension
+# name to its source list and, optionally, per-target option overrides.
+# NOTE(review): `metrics_cython_tree` is not defined in this file; presumably
+# it is provided by a parent build definition -- confirm.
+cluster_extension_metadata = {
+  '_dbscan_inner':
+    {'sources': ['_dbscan_inner.pyx'], 'override_options': ['cython_language=cpp']},
+  '_hierarchical_fast':
+    {'sources': ['_hierarchical_fast.pyx', metrics_cython_tree],
+     'override_options': ['cython_language=cpp']},
+  '_k_means_common':
+    {'sources': ['_k_means_common.pyx']},
+  '_k_means_lloyd':
+    {'sources': ['_k_means_lloyd.pyx']},
+  '_k_means_elkan':
+    {'sources': ['_k_means_elkan.pyx']},
+  '_k_means_minibatch':
+    {'sources': ['_k_means_minibatch.pyx']},
+}
+
+# Declare one extension module per entry above. `utils_cython_tree` is added
+# to every target's sources, and entries without 'override_options' fall back
+# to an empty list.
+foreach ext_name, ext_dict : cluster_extension_metadata
+  py.extension_module(
+    ext_name,
+    [ext_dict.get('sources'), utils_cython_tree],
+    dependencies: [np_dep, openmp_dep],
+    override_options : ext_dict.get('override_options', []),
+    cython_args: cython_args,
+    subdir: 'sklearn/cluster',
+    install: true
+  )
+endforeach
+
+# Process the build definition located in the '_hdbscan' subdirectory.
+subdir('_hdbscan')
@@ -0,0 +1,37 @@
+"""
+Common utilities for testing clustering.
+
+"""
+
+import numpy as np
+
+###############################################################################
+# Generate sample data
+
+
+def generate_clustered_data(
+    seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
+):
+    """Draw Gaussian blobs around fixed, off-origin cluster centers.
+
+    Parameters
+    ----------
+    seed : int, default=0
+        Seed of the ``RandomState`` used to draw the noise.
+    n_clusters : int, default=3
+        Number of blobs to draw; only four centers are defined.
+    n_features : int, default=2
+        Number of leading center coordinates to use; each center has four
+        coordinates.
+    n_samples_per_cluster : int, default=20
+        Number of samples drawn around each center.
+    std : float, default=0.4
+        Scale applied to the standard normal noise.
+
+    Returns
+    -------
+    X : ndarray of shape (n_clusters * n_samples_per_cluster, n_features)
+        The samples, stacked cluster after cluster.
+    """
+    rng = np.random.RandomState(seed)
+
+    # The centers are deliberately shifted away from zero to check that
+    # clustering algorithms are robust to non-centered data.
+    centers = 10 + np.array(
+        [
+            [1, 1, 1, 0],
+            [-1, -1, 0, 1],
+            [1, -1, 1, 1],
+            [-1, 1, 1, 0],
+        ]
+    )
+
+    # Seed the stack with an empty (0, n_features) block so that
+    # n_clusters=0 still returns a correctly shaped array. Blobs are drawn
+    # in cluster order, one `randn` call per cluster.
+    blocks = [np.empty((0, n_features))]
+    for k in range(n_clusters):
+        noise = std * rng.randn(n_samples_per_cluster, n_features)
+        blocks.append(centers[k][:n_features] + noise)
+    return np.vstack(blocks)
@@ -0,0 +1,321 @@
+"""
+Testing for Clustering methods
+
+"""
+
+import warnings
+
+import numpy as np
+import pytest
+
+from sklearn.cluster import AffinityPropagation, affinity_propagation
+from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences
+from sklearn.datasets import make_blobs
+from sklearn.exceptions import ConvergenceWarning, NotFittedError
+from sklearn.metrics import euclidean_distances
+from sklearn.utils._testing import assert_allclose, assert_array_equal
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# Shared test data: 60 two-dimensional samples drawn by ``make_blobs`` around
+# three centers shifted away from the origin. ``n_clusters`` is the number of
+# clusters the tests below expect to recover.
+n_clusters = 3
+centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
+X, _ = make_blobs(
+    n_samples=60,
+    n_features=2,
+    centers=centers,
+    cluster_std=0.4,
+    shuffle=True,
+    random_state=0,
+)
+
+# TODO: AffinityPropagation must preserve dtype for its fitted attributes
+# and test must be created accordingly to this new behavior.
+# For more details, see: https://github.com/scikit-learn/scikit-learn/issues/11000
+
+
+def test_affinity_propagation(global_random_seed, global_dtype):
+    """Check that the functional API finds the expected number of exemplars.
+
+    The similarity matrix is the negated squared Euclidean distance between
+    the module-level samples cast to ``global_dtype``.
+    """
+    samples = X.astype(global_dtype, copy=False)
+    similarities = -euclidean_distances(samples, squared=True)
+
+    exemplar_indices, _ = affinity_propagation(
+        similarities,
+        preference=np.median(similarities) * 10,
+        random_state=global_random_seed,
+    )
+
+    assert n_clusters == len(exemplar_indices)
+
+
+def test_affinity_propagation_precomputed():
+    """Labels from a precomputed affinity matrix must equal the labels obtained
+    when the estimator computes the affinity internally.
+    """
+    similarities = -euclidean_distances(X, squared=True)
+    preference = np.median(similarities) * 10
+
+    precomputed_model = AffinityPropagation(
+        preference=preference, affinity="precomputed", random_state=28
+    )
+    labels_precomputed = precomputed_model.fit(similarities).labels_
+
+    feature_model = AffinityPropagation(
+        preference=preference, verbose=True, random_state=37
+    )
+    labels = feature_model.fit(X).labels_
+
+    assert_array_equal(labels, labels_precomputed)
+
+    # Every exemplar must own at least one sample, and the number of
+    # exemplars must match the number of generated blobs.
+    n_found = len(feature_model.cluster_centers_indices_)
+    assert np.unique(labels).size == n_found
+    assert n_clusters == n_found
+
+
+def test_affinity_propagation_no_copy():
+    """Check the effect of the ``copy`` flag on the input similarity matrix."""
+    similarities = -euclidean_distances(X, squared=True)
+    pristine = similarities.copy()
+    preference = np.median(similarities) * 10
+    # Sanity check: the diagonal does not already hold the preference.
+    assert not np.allclose(similarities.diagonal(), preference)
+
+    # copy=True: the caller's matrix must come back untouched.
+    affinity_propagation(
+        similarities, preference=preference, copy=True, random_state=0
+    )
+    assert_allclose(similarities, pristine)
+    assert not np.allclose(similarities.diagonal(), preference)
+    assert_allclose(similarities.diagonal(), np.zeros(similarities.shape[0]))
+
+    # copy=False: the preference is written onto the diagonal in place.
+    affinity_propagation(
+        similarities, preference=preference, copy=False, random_state=0
+    )
+    assert_allclose(similarities.diagonal(), preference)
+
+    # Both settings must lead to the same labels.
+    similarities = pristine.copy()
+    estimator = AffinityPropagation(
+        preference=preference, verbose=True, random_state=0
+    )
+    labels_estimator = estimator.fit(X).labels_
+
+    _, labels_in_place = affinity_propagation(
+        similarities, preference=preference, copy=False, random_state=74
+    )
+    assert_array_equal(labels_estimator, labels_in_place)
+
+
+def test_affinity_propagation_affinity_shape():
+    """Check that `affinity_propagation` rejects a non-square similarity matrix."""
+    similarities = -euclidean_distances(X, squared=True)
+    # Dropping the last column makes the matrix rectangular.
+    non_square = similarities[:, :-1]
+
+    with pytest.raises(
+        ValueError, match="The matrix of similarities must be a square array"
+    ):
+        affinity_propagation(non_square)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_affinity_propagation_precomputed_with_sparse_input(csr_container):
+    """Check that a sparse precomputed affinity matrix raises a ``TypeError``."""
+    sparse_affinity = csr_container((3, 3))
+    estimator = AffinityPropagation(affinity="precomputed")
+
+    with pytest.raises(
+        TypeError, match="Sparse data was passed for X, but dense data is required"
+    ):
+        estimator.fit(sparse_affinity)
+
+
+def test_affinity_propagation_predict(global_random_seed, global_dtype):
+    """Check that `predict` on the training data reproduces `fit_predict`."""
+    samples = X.astype(global_dtype, copy=False)
+    estimator = AffinityPropagation(
+        affinity="euclidean", random_state=global_random_seed
+    )
+
+    fitted_labels = estimator.fit_predict(samples)
+    predicted_labels = estimator.predict(samples)
+
+    assert_array_equal(fitted_labels, predicted_labels)
+
+
+def test_affinity_propagation_predict_error():
+    """Check the exceptions raised by `AffinityPropagation.predict`."""
+    # Calling predict before fit must fail.
+    unfitted = AffinityPropagation(affinity="euclidean")
+    with pytest.raises(NotFittedError):
+        unfitted.predict(X)
+
+    # A model fitted on a precomputed (n_samples, n_samples) matrix cannot
+    # predict from the raw features.
+    gram = np.dot(X, X.T)
+    precomputed = AffinityPropagation(affinity="precomputed", random_state=57)
+    precomputed.fit(gram)
+    with pytest.raises(ValueError, match="expecting 60 features as input"):
+        precomputed.predict(X)
+
+
+def test_affinity_propagation_fit_non_convergence(global_dtype):
+    """Check the fitted attributes when `fit` does not converge.
+
+    The cluster centers should be an empty array and every training sample
+    should be labelled as noise (-1).
+    """
+    points = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)
+
+    # A single iteration is not enough to converge.
+    estimator = AffinityPropagation(preference=-10, max_iter=1, random_state=82)
+    with pytest.warns(ConvergenceWarning):
+        estimator.fit(points)
+
+    assert_allclose(np.empty((0, 2)), estimator.cluster_centers_)
+    assert_array_equal(np.array([-1, -1, -1]), estimator.labels_)
+
+
+def test_affinity_propagation_equal_mutual_similarities(global_dtype):
+    """Check the outcome when all mutual similarities are equal."""
+    points = np.array([[-1, 1], [1, -1]], dtype=global_dtype)
+    similarities = -euclidean_distances(points, squared=True)
+
+    # A scalar preference must trigger the "mutually equal" warning:
+    # - preference > similarity: every sample becomes an exemplar;
+    # - preference < similarity: one cluster, with the arbitrary (first)
+    #   sample as exemplar.
+    scalar_cases = [
+        (0, [0, 1], [0, 1]),
+        (-10, [0], [0, 0]),
+    ]
+    for preference, expected_centers, expected_labels in scalar_cases:
+        with pytest.warns(UserWarning, match="mutually equal"):
+            center_indices, labels = affinity_propagation(
+                similarities, preference=preference
+            )
+        assert_array_equal(expected_centers, center_indices)
+        assert_array_equal(expected_labels, labels)
+
+    # Different preferences per sample: no UserWarning may be raised.
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        center_indices, labels = affinity_propagation(
+            similarities, preference=[-20, -10], random_state=37
+        )
+
+    # One cluster, with the highest-preference sample as exemplar.
+    assert_array_equal([1], center_indices)
+    assert_array_equal([0, 0], labels)
+
+
+def test_affinity_propagation_predict_non_convergence(global_dtype):
+    """Check `predict` on a model whose `fit` did not converge.
+
+    Such a model has no clusters, so new samples are expected to be labelled
+    as noise (-1) and a ``ConvergenceWarning`` is expected again.
+    """
+    train = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)
+    estimator = AffinityPropagation(preference=-10, max_iter=1, random_state=75)
+
+    # A single iteration is not enough to converge.
+    with pytest.warns(ConvergenceWarning):
+        estimator = estimator.fit(train)
+
+    unseen = np.array([[2, 2], [3, 3], [4, 4]])
+    with pytest.warns(ConvergenceWarning):
+        predicted = estimator.predict(unseen)
+
+    assert_array_equal(np.array([-1, -1, -1]), predicted)
+
+
+def test_affinity_propagation_non_convergence_regressiontest(global_dtype):
+    """Check the warning message and labels after a non-converged fit."""
+    points = np.array(
+        [[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]],
+        dtype=global_dtype,
+    )
+    estimator = AffinityPropagation(
+        affinity="euclidean", max_iter=2, random_state=34
+    )
+
+    expected_warning = (
+        "Affinity propagation did not converge, this model may return degenerate"
+        " cluster centers and labels."
+    )
+    with pytest.warns(ConvergenceWarning, match=expected_warning):
+        estimator.fit(points)
+
+    assert_array_equal(np.array([0, 0, 0]), estimator.labels_)
+
+
+def test_equal_similarities_and_preferences(global_dtype):
+    """Check `_equal_similarities_and_preferences` on small inputs."""
+    # Unequal distances: the helper must return a falsy value whatever the
+    # preference.
+    points = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)
+    similarities = -euclidean_distances(points, squared=True)
+    for preference in (np.array(0), np.array([0, 0]), np.array([0, 1])):
+        assert not _equal_similarities_and_preferences(similarities, preference)
+
+    # Equal distances: the outcome now depends on the preferences alone.
+    points = np.array([[0, 0], [1, 1]], dtype=global_dtype)
+    similarities = -euclidean_distances(points, squared=True)
+
+    # Different preferences
+    assert not _equal_similarities_and_preferences(similarities, np.array([0, 1]))
+
+    # Same preferences, given as an array or as a 0-d array
+    for preference in (np.array([0, 0]), np.array(0)):
+        assert _equal_similarities_and_preferences(similarities, preference)
+
+
+def test_affinity_propagation_random_state():
+    """Check that different random states lead to different initialisations
+    by looking at the center locations after two iterations.
+    """
+    blob_centers = [[1, 1], [-1, -1], [1, -1]]
+    samples, _ = make_blobs(
+        n_samples=300, centers=blob_centers, cluster_std=0.5, random_state=0
+    )
+
+    # Fit once per seed, in the order 0 then 76, and keep the centers.
+    centers_by_seed = {}
+    for seed in (0, 76):
+        estimator = AffinityPropagation(
+            convergence_iter=1, max_iter=2, random_state=seed
+        )
+        estimator.fit(samples)
+        centers_by_seed[seed] = estimator.cluster_centers_
+
+    # check that the centers have not yet converged to the same solution
+    assert np.mean((centers_by_seed[0] - centers_by_seed[76]) ** 2) > 1
+
+
+@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array])
+def test_affinity_propagation_convergence_warning_dense_sparse(container, global_dtype):
+    """
+    Check that having sparse or dense `centers` format should not
+    influence the convergence.
+    Non-regression test for gh-13334.
+    """
+    single_center = container(np.zeros((1, 10)))
+
+    rng = np.random.RandomState(42)
+    features = rng.rand(40, 10).astype(global_dtype, copy=False)
+    targets = (4 * rng.rand(40)).astype(int)
+
+    estimator = AffinityPropagation(random_state=46)
+    estimator.fit(features, targets)
+    # Replace the fitted centers with the single all-zero center.
+    estimator.cluster_centers_ = single_center
+
+    # Any ConvergenceWarning raised by predict is turned into an error.
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", ConvergenceWarning)
+        expected = np.zeros(features.shape[0], dtype=int)
+        assert_array_equal(estimator.predict(features), expected)
+
+
+# FIXME; this test is broken with different random states, needs to be revisited
+def test_correct_clusters(global_dtype):
+    """Check the labels found on a small precomputed affinity matrix.
+
+    Non-regression test for issue #10832 (incorrect clusters due to a dtype
+    change).
+    """
+    affinity_matrix = np.array(
+        [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]],
+        dtype=global_dtype,
+    )
+    estimator = AffinityPropagation(
+        preference=1, affinity="precomputed", random_state=0
+    )
+    estimator = estimator.fit(affinity_matrix)
+
+    assert_array_equal(estimator.labels_, np.array([0, 1, 1, 2]))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sparse_input_for_predict(csr_container):
+    """Check that `predict` accepts sparse input.
+
+    Non-regression test for issue #20049.
+    """
+    estimator = AffinityPropagation(affinity="euclidean", random_state=42)
+    estimator.fit(X)
+
+    # An empty (2, 2) sparse container, i.e. two all-zero samples.
+    sparse_samples = csr_container((2, 2))
+    predicted = estimator.predict(sparse_samples)
+
+    assert_array_equal(predicted, (2, 2))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sparse_input_for_fit_predict(csr_container):
+    """Check that `fit_predict` accepts sparse input.
+
+    Non-regression test for issue #20049.
+    """
+    estimator = AffinityPropagation(affinity="euclidean", random_state=42)
+
+    rng = np.random.RandomState(42)
+    dense_samples = rng.randint(0, 2, size=(5, 5))
+    sparse_samples = csr_container(dense_samples)
+
+    predicted = estimator.fit_predict(sparse_samples)
+    assert_array_equal(predicted, (0, 1, 1, 2, 3))
+
+
+def test_affinity_propagation_equal_points():
+    """Make sure we do not assign multiple clusters to equal points.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/pull/20043
+    """
+    identical_points = np.zeros((8, 1))
+    estimator = AffinityPropagation(
+        affinity="euclidean", damping=0.5, random_state=42
+    )
+    estimator = estimator.fit(identical_points)
+
+    assert np.all(estimator.labels_ == 0)
@@ -0,0 +1,264 @@
+"""Testing for Spectral Biclustering methods"""
+
+import numpy as np
+import pytest
+from scipy.sparse import issparse
+
+from sklearn.base import BaseEstimator, BiclusterMixin
+from sklearn.cluster import SpectralBiclustering, SpectralCoclustering
+from sklearn.cluster._bicluster import (
+    _bistochastic_normalize,
+    _log_normalize,
+    _scale_normalize,
+)
+from sklearn.datasets import make_biclusters, make_checkerboard
+from sklearn.metrics import consensus_score, v_measure_score
+from sklearn.model_selection import ParameterGrid
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+
+class MockBiclustering(BiclusterMixin, BaseEstimator):
+    # Mock object for testing get_submatrix.
+    def __init__(self):
+        pass
+
+    def get_indices(self, i):
+        # Overridden to reproduce old get_submatrix test.
+        return (
+            np.where([True, True, False, False, True])[0],
+            np.where([False, False, True, True])[0],
+        )
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_get_submatrix(csr_container):
+    data = np.arange(20).reshape(5, 4)
+    model = MockBiclustering()
+
+    for X in (data, csr_container(data), data.tolist()):
+        submatrix = model.get_submatrix(0, X)
+        if issparse(submatrix):
+            submatrix = submatrix.toarray()
+        assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]])
+        submatrix[:] = -1
+        if issparse(X):
+            X = X.toarray()
+        assert np.all(X != -1)
+
+
+def _test_shape_indices(model):
+    # Test get_shape and get_indices on fitted model.
+    for i in range(model.n_clusters):
+        m, n = model.get_shape(i)
+        i_ind, j_ind = model.get_indices(i)
+        assert len(i_ind) == m
+        assert len(j_ind) == n
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_spectral_coclustering(global_random_seed, csr_container):
+    # Test Dhillon's Spectral CoClustering on a simple problem.
+    param_grid = {
+        "svd_method": ["randomized", "arpack"],
+        "n_svd_vecs": [None, 20],
+        "mini_batch": [False, True],
+        "init": ["k-means++"],
+        "n_init": [10],
+    }
+    S, rows, cols = make_biclusters(
+        (30, 30), 3, noise=0.1, random_state=global_random_seed
+    )
+    S -= S.min()  # needs to be nonnegative before making it sparse
+    S = np.where(S < 1, 0, S)  # threshold some values
+    for mat in (S, csr_container(S)):
+        for kwargs in ParameterGrid(param_grid):
+            model = SpectralCoclustering(
+                n_clusters=3, random_state=global_random_seed, **kwargs
+            )
+            model.fit(mat)
+
+            assert model.rows_.shape == (3, 30)
+            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
+            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
+            assert consensus_score(model.biclusters_, (rows, cols)) == 1
+
+            _test_shape_indices(model)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_spectral_biclustering(global_random_seed, csr_container):
+    # Test Kluger methods on a checkerboard dataset.
+    S, rows, cols = make_checkerboard(
+        (30, 30), 3, noise=0.5, random_state=global_random_seed
+    )
+
+    non_default_params = {
+        "method": ["scale", "log"],
+        "svd_method": ["arpack"],
+        "n_svd_vecs": [20],
+        "mini_batch": [True],
+    }
+
+    for mat in (S, csr_container(S)):
+        for param_name, param_values in non_default_params.items():
+            for param_value in param_values:
+                model = SpectralBiclustering(
+                    n_clusters=3,
+                    n_init=3,
+                    init="k-means++",
+                    random_state=global_random_seed,
+                )
+                model.set_params(**dict([(param_name, param_value)]))
+
+                if issparse(mat) and model.get_params().get("method") == "log":
+                    # cannot take log of sparse matrix
+                    with pytest.raises(ValueError):
+                        model.fit(mat)
+                    continue
+                else:
+                    model.fit(mat)
+
+                assert model.rows_.shape == (9, 30)
+                assert model.columns_.shape == (9, 30)
+                assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
+                assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
+                assert consensus_score(model.biclusters_, (rows, cols)) == 1
+
+                _test_shape_indices(model)
+
+
+def _do_scale_test(scaled):
+    """Check that rows sum to one constant, and columns to another."""
+    row_sum = scaled.sum(axis=1)
+    col_sum = scaled.sum(axis=0)
+    if issparse(scaled):
+        row_sum = np.asarray(row_sum).squeeze()
+        col_sum = np.asarray(col_sum).squeeze()
+    assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1)
+    assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1)
+
+
+def _do_bistochastic_test(scaled):
+    """Check that rows and columns sum to the same constant."""
+    _do_scale_test(scaled)
+    assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_scale_normalize(global_random_seed, csr_container):
+    generator = np.random.RandomState(global_random_seed)
+    X = generator.rand(100, 100)
+    for mat in (X, csr_container(X)):
+        scaled, _, _ = _scale_normalize(mat)
+        _do_scale_test(scaled)
+        if issparse(mat):
+            assert issparse(scaled)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_bistochastic_normalize(global_random_seed, csr_container):
+    generator = np.random.RandomState(global_random_seed)
+    X = generator.rand(100, 100)
+    for mat in (X, csr_container(X)):
+        scaled = _bistochastic_normalize(mat)
+        _do_bistochastic_test(scaled)
+        if issparse(mat):
+            assert issparse(scaled)
+
+
+def test_log_normalize(global_random_seed):
+    # adding any constant to a log-scaled matrix should make it
+    # bistochastic
+    generator = np.random.RandomState(global_random_seed)
+    mat = generator.rand(100, 100)
+    scaled = _log_normalize(mat) + 1
+    _do_bistochastic_test(scaled)
+
+
+def test_fit_best_piecewise(global_random_seed):
+    model = SpectralBiclustering(random_state=global_random_seed)
+    vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]])
+    best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
+    assert_array_equal(best, vectors[:2])
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_project_and_cluster(global_random_seed, csr_container):
+    model = SpectralBiclustering(random_state=global_random_seed)
+    data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
+    vectors = np.array([[1, 0], [0, 1], [0, 0]])
+    for mat in (data, csr_container(data)):
+        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
+        assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0)
+
+
+def test_perfect_checkerboard(global_random_seed):
+    # XXX Previously failed on build bot (not reproducible)
+    model = SpectralBiclustering(
+        3, svd_method="arpack", random_state=global_random_seed
+    )
+
+    S, rows, cols = make_checkerboard(
+        (30, 30), 3, noise=0, random_state=global_random_seed
+    )
+    model.fit(S)
+    assert consensus_score(model.biclusters_, (rows, cols)) == 1
+
+    S, rows, cols = make_checkerboard(
+        (40, 30), 3, noise=0, random_state=global_random_seed
+    )
+    model.fit(S)
+    assert consensus_score(model.biclusters_, (rows, cols)) == 1
+
+    S, rows, cols = make_checkerboard(
+        (30, 40), 3, noise=0, random_state=global_random_seed
+    )
+    model.fit(S)
+    assert consensus_score(model.biclusters_, (rows, cols)) == 1
+
+
+@pytest.mark.parametrize(
+    "params, type_err, err_msg",
+    [
+        (
+            {"n_clusters": 6},
+            ValueError,
+            "n_clusters should be <= n_samples=5",
+        ),
+        (
+            {"n_clusters": (3, 3, 3)},
+            ValueError,
+            "Incorrect parameter n_clusters",
+        ),
+        (
+            {"n_clusters": (3, 6)},
+            ValueError,
+            "Incorrect parameter n_clusters",
+        ),
+        (
+            {"n_components": 3, "n_best": 4},
+            ValueError,
+            "n_best=4 must be <= n_components=3",
+        ),
+    ],
+)
+def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
+    """Check parameters validation in `SpectralBiClustering`"""
+    data = np.arange(25).reshape((5, 5))
+    model = SpectralBiclustering(**params)
+    with pytest.raises(type_err, match=err_msg):
+        model.fit(data)
+
+
+@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering()))
+def test_n_features_in_(est):
+    X, _, _ = make_biclusters((3, 3), 3, random_state=0)
+
+    assert not hasattr(est, "n_features_in_")
+    est.fit(X)
+    assert est.n_features_in_ == 3
@@ -0,0 +1,242 @@
+"""
+Tests for the birch clustering algorithm.
+"""
+
+import numpy as np
+import pytest
+
+from sklearn.cluster import AgglomerativeClustering, Birch
+from sklearn.cluster.tests.common import generate_clustered_data
+from sklearn.datasets import make_blobs
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.metrics import pairwise_distances_argmin, v_measure_score
+from sklearn.utils._testing import assert_allclose, assert_array_equal
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+
+def test_n_samples_leaves_roots(global_random_seed, global_dtype):
+    # Sanity check for the number of samples in leaves and roots
+    X, y = make_blobs(n_samples=10, random_state=global_random_seed)
+    X = X.astype(global_dtype, copy=False)
+    brc = Birch()
+    brc.fit(X)
+    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
+    n_samples_leaves = sum(
+        [sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_]
+    )
+    assert n_samples_leaves == X.shape[0]
+    assert n_samples_root == X.shape[0]
+
+
+def test_partial_fit(global_random_seed, global_dtype):
+    # Test that fit is equivalent to calling partial_fit multiple times
+    X, y = make_blobs(n_samples=100, random_state=global_random_seed)
+    X = X.astype(global_dtype, copy=False)
+    brc = Birch(n_clusters=3)
+    brc.fit(X)
+    brc_partial = Birch(n_clusters=None)
+    brc_partial.partial_fit(X[:50])
+    brc_partial.partial_fit(X[50:])
+    assert_allclose(brc_partial.subcluster_centers_, brc.subcluster_centers_)
+
+    # Test that same global labels are obtained after calling partial_fit
+    # with None
+    brc_partial.set_params(n_clusters=3)
+    brc_partial.partial_fit(None)
+    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
+
+
+def test_birch_predict(global_random_seed, global_dtype):
+    # Test the predict method predicts the nearest centroid.
+    rng = np.random.RandomState(global_random_seed)
+    X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10)
+    X = X.astype(global_dtype, copy=False)
+
+    # n_samples * n_samples_per_cluster
+    shuffle_indices = np.arange(30)
+    rng.shuffle(shuffle_indices)
+    X_shuffle = X[shuffle_indices, :]
+    brc = Birch(n_clusters=4, threshold=1.0)
+    brc.fit(X_shuffle)
+
+    # Birch must preserve inputs' dtype
+    assert brc.subcluster_centers_.dtype == global_dtype
+
+    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
+    centroids = brc.subcluster_centers_
+    nearest_centroid = brc.subcluster_labels_[
+        pairwise_distances_argmin(X_shuffle, centroids)
+    ]
+    assert_allclose(v_measure_score(nearest_centroid, brc.labels_), 1.0)
+
+
+def test_n_clusters(global_random_seed, global_dtype):
+    # Test that n_clusters param works properly
+    X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
+    X = X.astype(global_dtype, copy=False)
+    brc1 = Birch(n_clusters=10)
+    brc1.fit(X)
+    assert len(brc1.subcluster_centers_) > 10
+    assert len(np.unique(brc1.labels_)) == 10
+
+    # Test that n_clusters = Agglomerative Clustering gives
+    # the same results.
+    gc = AgglomerativeClustering(n_clusters=10)
+    brc2 = Birch(n_clusters=gc)
+    brc2.fit(X)
+    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
+    assert_array_equal(brc1.labels_, brc2.labels_)
+
+    # Test that a small number of clusters raises a warning.
+    brc4 = Birch(threshold=10000.0)
+    with pytest.warns(ConvergenceWarning):
+        brc4.fit(X)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sparse_X(global_random_seed, global_dtype, csr_container):
+    # Test that sparse and dense data give same results
+    X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
+    X = X.astype(global_dtype, copy=False)
+    brc = Birch(n_clusters=10)
+    brc.fit(X)
+
+    csr = csr_container(X)
+    brc_sparse = Birch(n_clusters=10)
+    brc_sparse.fit(csr)
+
+    # Birch must preserve inputs' dtype
+    assert brc_sparse.subcluster_centers_.dtype == global_dtype
+
+    assert_array_equal(brc.labels_, brc_sparse.labels_)
+    assert_allclose(brc.subcluster_centers_, brc_sparse.subcluster_centers_)
+
+
+def test_partial_fit_second_call_error_checks():
+    # second partial fit calls will error when n_features is not consistent
+    # with the first call
+    X, y = make_blobs(n_samples=100)
+    brc = Birch(n_clusters=3)
+    brc.partial_fit(X, y)
+
+    msg = "X has 1 features, but Birch is expecting 2 features"
+    with pytest.raises(ValueError, match=msg):
+        brc.partial_fit(X[:, [0]], y)
+
+
+def check_branching_factor(node, branching_factor):
+    subclusters = node.subclusters_
+    assert branching_factor >= len(subclusters)
+    for cluster in subclusters:
+        if cluster.child_:
+            check_branching_factor(cluster.child_, branching_factor)
+
+
+def test_branching_factor(global_random_seed, global_dtype):
+    # Test that nodes have at max branching_factor number of subclusters
+    X, y = make_blobs(random_state=global_random_seed)
+    X = X.astype(global_dtype, copy=False)
+    branching_factor = 9
+
+    # Purposefully set a low threshold to maximize the subclusters.
+    brc = Birch(n_clusters=None, branching_factor=branching_factor, threshold=0.01)
+    brc.fit(X)
+    check_branching_factor(brc.root_, branching_factor)
+    brc = Birch(n_clusters=3, branching_factor=branching_factor, threshold=0.01)
+    brc.fit(X)
+    check_branching_factor(brc.root_, branching_factor)
+
+
+def check_threshold(birch_instance, threshold):
+    """Use the leaf linked list for traversal"""
+    current_leaf = birch_instance.dummy_leaf_.next_leaf_
+    while current_leaf:
+        subclusters = current_leaf.subclusters_
+        for sc in subclusters:
+            assert threshold >= sc.radius
+        current_leaf = current_leaf.next_leaf_
+
+
+def test_threshold(global_random_seed, global_dtype):
+    # Test that the leaf subclusters have a threshold lesser than radius
+    X, y = make_blobs(n_samples=80, centers=4, random_state=global_random_seed)
+    X = X.astype(global_dtype, copy=False)
+    brc = Birch(threshold=0.5, n_clusters=None)
+    brc.fit(X)
+    check_threshold(brc, 0.5)
+
+    brc = Birch(threshold=5.0, n_clusters=None)
+    brc.fit(X)
+    check_threshold(brc, 5.0)
+
+
+def test_birch_n_clusters_long_int():
+    # Check that birch supports n_clusters with np.int64 dtype, for instance
+    # coming from np.arange. #16484
+    X, _ = make_blobs(random_state=0)
+    n_clusters = np.int64(5)
+    Birch(n_clusters=n_clusters).fit(X)
+
+
+def test_feature_names_out():
+    """Check `get_feature_names_out` for `Birch`."""
+    X, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
+    brc = Birch(n_clusters=4)
+    brc.fit(X)
+    n_clusters = brc.subcluster_centers_.shape[0]
+
+    names_out = brc.get_feature_names_out()
+    assert_array_equal([f"birch{i}" for i in range(n_clusters)], names_out)
+
+
+def test_transform_match_across_dtypes(global_random_seed):
+    X, _ = make_blobs(n_samples=80, n_features=4, random_state=global_random_seed)
+    brc = Birch(n_clusters=4, threshold=1.1)
+    Y_64 = brc.fit_transform(X)
+    Y_32 = brc.fit_transform(X.astype(np.float32))
+
+    assert_allclose(Y_64, Y_32, atol=1e-6)
+
+
+def test_subcluster_dtype(global_dtype):
+    X = make_blobs(n_samples=80, n_features=4, random_state=0)[0].astype(
+        global_dtype, copy=False
+    )
+    brc = Birch(n_clusters=4)
+    assert brc.fit(X).subcluster_centers_.dtype == global_dtype
+
+
+def test_both_subclusters_updated():
+    """Check that both subclusters are updated when a node a split, even when there are
+    duplicated data points. Non-regression test for #23269.
+    """
+
+    X = np.array(
+        [
+            [-2.6192791, -1.5053215],
+            [-2.9993038, -1.6863596],
+            [-2.3724914, -1.3438171],
+            [-2.336792, -1.3417323],
+            [-2.4089134, -1.3290224],
+            [-2.3724914, -1.3438171],
+            [-3.364009, -1.8846745],
+            [-2.3724914, -1.3438171],
+            [-2.617677, -1.5003285],
+            [-2.2960556, -1.3260119],
+            [-2.3724914, -1.3438171],
+            [-2.5459878, -1.4533926],
+            [-2.25979, -1.3003055],
+            [-2.4089134, -1.3290224],
+            [-2.3724914, -1.3438171],
+            [-2.4089134, -1.3290224],
+            [-2.5459878, -1.4533926],
+            [-2.3724914, -1.3438171],
+            [-2.9720619, -1.7058647],
+            [-2.336792, -1.3417323],
+            [-2.3724914, -1.3438171],
+        ],
+        dtype=np.float32,
+    )
+
+    # no error
+    Birch(branching_factor=5, threshold=1e-5, n_clusters=None).fit(X)
@@ -0,0 +1,158 @@
+import numpy as np
+import pytest
+
+from sklearn.cluster import BisectingKMeans
+from sklearn.metrics import v_measure_score
+from sklearn.utils._testing import assert_allclose, assert_array_equal
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+
+@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"])
+@pytest.mark.parametrize("init", ["k-means++", "random"])
+def test_three_clusters(bisecting_strategy, init):
+    """Tries to perform bisect k-means for three clusters to check
+    if splitting data is performed correctly.
+    """
+    X = np.array(
+        [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]]
+    )
+    bisect_means = BisectingKMeans(
+        n_clusters=3,
+        random_state=0,
+        bisecting_strategy=bisecting_strategy,
+        init=init,
+    )
+    bisect_means.fit(X)
+
+    expected_centers = [[2, 1], [10, 1], [10, 9]]
+    expected_labels = [0, 1, 0, 1, 0, 1, 2, 2, 2]
+
+    assert_allclose(
+        sorted(expected_centers), sorted(bisect_means.cluster_centers_.tolist())
+    )
+    assert_allclose(v_measure_score(expected_labels, bisect_means.labels_), 1.0)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sparse(csr_container):
+    """Test Bisecting K-Means with sparse data.
+
+    Checks if labels and centers are the same between dense and sparse.
+    """
+
+    rng = np.random.RandomState(0)
+
+    X = rng.rand(20, 2)
+    X[X < 0.8] = 0
+    X_csr = csr_container(X)
+
+    bisect_means = BisectingKMeans(n_clusters=3, random_state=0)
+
+    bisect_means.fit(X_csr)
+    sparse_centers = bisect_means.cluster_centers_
+
+    bisect_means.fit(X)
+    normal_centers = bisect_means.cluster_centers_
+
+    # Check if results is the same for dense and sparse data
+    assert_allclose(normal_centers, sparse_centers, atol=1e-8)
+
+
+@pytest.mark.parametrize("n_clusters", [4, 5])
+def test_n_clusters(n_clusters):
+    """Test if resulting labels are in range [0, n_clusters - 1]."""
+
+    rng = np.random.RandomState(0)
+    X = rng.rand(10, 2)
+
+    bisect_means = BisectingKMeans(n_clusters=n_clusters, random_state=0)
+    bisect_means.fit(X)
+
+    assert_array_equal(np.unique(bisect_means.labels_), np.arange(n_clusters))
+
+
+def test_one_cluster():
+    """Test single cluster."""
+
+    X = np.array([[1, 2], [10, 2], [10, 8]])
+
+    bisect_means = BisectingKMeans(n_clusters=1, random_state=0).fit(X)
+
+    # All labels from fit or predict should be equal 0
+    assert all(bisect_means.labels_ == 0)
+    assert all(bisect_means.predict(X) == 0)
+
+    assert_allclose(bisect_means.cluster_centers_, X.mean(axis=0).reshape(1, -1))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
+def test_fit_predict(csr_container):
+    """Check if labels from fit(X) method are same as from fit(X).predict(X)."""
+    rng = np.random.RandomState(0)
+
+    X = rng.rand(10, 2)
+
+    if csr_container is not None:
+        X[X < 0.8] = 0
+        X = csr_container(X)
+
+    bisect_means = BisectingKMeans(n_clusters=3, random_state=0)
+    bisect_means.fit(X)
+
+    assert_array_equal(bisect_means.labels_, bisect_means.predict(X))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
+def test_dtype_preserved(csr_container, global_dtype):
+    """Check that centers dtype is the same as input data dtype."""
+    rng = np.random.RandomState(0)
+    X = rng.rand(10, 2).astype(global_dtype, copy=False)
+
+    if csr_container is not None:
+        X[X < 0.8] = 0
+        X = csr_container(X)
+
+    km = BisectingKMeans(n_clusters=3, random_state=0)
+    km.fit(X)
+
+    assert km.cluster_centers_.dtype == global_dtype
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
+def test_float32_float64_equivalence(csr_container):
+    """Check that the results are the same between float32 and float64."""
+    rng = np.random.RandomState(0)
+    X = rng.rand(10, 2)
+
+    if csr_container is not None:
+        X[X < 0.8] = 0
+        X = csr_container(X)
+
+    km64 = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
+    km32 = BisectingKMeans(n_clusters=3, random_state=0).fit(X.astype(np.float32))
+
+    assert_allclose(km32.cluster_centers_, km64.cluster_centers_)
+    assert_array_equal(km32.labels_, km64.labels_)
+
+
+@pytest.mark.parametrize("algorithm", ("lloyd", "elkan"))
+def test_no_crash_on_empty_bisections(algorithm):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/27081
+    rng = np.random.RandomState(0)
+    X_train = rng.rand(3000, 10)
+    bkm = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(X_train)
+
+    # predict on scaled data to trigger pathologic case
+    # where the inner mask leads to empty bisections.
+    X_test = 50 * rng.rand(100, 10)
+    labels = bkm.predict(X_test)  # should not crash with idiv by 0
+    assert np.isin(np.unique(labels), np.arange(10)).all()
+
+
+def test_one_feature():
+    # Check that no error is raised when there is only one feature
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/27236
+    X = np.random.normal(size=(128, 1))
+    BisectingKMeans(bisecting_strategy="biggest_inertia", random_state=0).fit(X)
@@ -0,0 +1,434 @@
+"""
+Tests for DBSCAN clustering algorithm
+"""
+
+import pickle
+import warnings
+
+import numpy as np
+import pytest
+from scipy.spatial import distance
+
+from sklearn.cluster import DBSCAN, dbscan
+from sklearn.cluster.tests.common import generate_clustered_data
+from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils._testing import assert_array_equal
+from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS
+
+# Module-level fixtures read by the tests below: the number of clusters
+# to generate and the dataset generated with it.
+n_clusters = 3
+X = generate_clustered_data(n_clusters=n_clusters)
+
+
+def test_dbscan_similarity():
+    # Tests the DBSCAN algorithm with a similarity array.
+    # Parameters chosen specifically for this task.
+    eps = 0.15
+    min_samples = 10
+    # Compute similarities
+    D = distance.squareform(distance.pdist(X))
+    D /= np.max(D)
+    # Compute DBSCAN
+    core_samples, labels = dbscan(
+        D, metric="precomputed", eps=eps, min_samples=min_samples
+    )
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
+
+    assert n_clusters_1 == n_clusters
+
+    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
+    labels = db.fit(D).labels_
+
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_2 == n_clusters
+
+
+def test_dbscan_feature():
+    # Tests the DBSCAN algorithm with a feature vector array.
+    # Parameters chosen specifically for this task.
+    # Different eps to other test, because distance is not normalised.
+    eps = 0.8
+    min_samples = 10
+    metric = "euclidean"
+    # Compute DBSCAN
+    # parameters chosen for task
+    core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples)
+
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_1 == n_clusters
+
+    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
+    labels = db.fit(X).labels_
+
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_2 == n_clusters
+
+
+@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
+def test_dbscan_sparse(lil_container):
+    core_sparse, labels_sparse = dbscan(lil_container(X), eps=0.8, min_samples=10)
+    core_dense, labels_dense = dbscan(X, eps=0.8, min_samples=10)
+    assert_array_equal(core_dense, core_sparse)
+    assert_array_equal(labels_dense, labels_sparse)
+
+
+@pytest.mark.parametrize("include_self", [False, True])
+def test_dbscan_sparse_precomputed(include_self):
+    D = pairwise_distances(X)
+    nn = NearestNeighbors(radius=0.9).fit(X)
+    X_ = X if include_self else None
+    D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance")
+    # Ensure it is sparse not merely on diagonals:
+    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
+    core_sparse, labels_sparse = dbscan(
+        D_sparse, eps=0.8, min_samples=10, metric="precomputed"
+    )
+    core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10, metric="precomputed")
+    assert_array_equal(core_dense, core_sparse)
+    assert_array_equal(labels_dense, labels_sparse)
+
+
+def test_dbscan_sparse_precomputed_different_eps():
+    # test that precomputed neighbors graph is filtered if computed with
+    # a radius larger than DBSCAN's eps.
+    lower_eps = 0.2
+    nn = NearestNeighbors(radius=lower_eps).fit(X)
+    D_sparse = nn.radius_neighbors_graph(X, mode="distance")
+    dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric="precomputed")
+
+    higher_eps = lower_eps + 0.7
+    nn = NearestNeighbors(radius=higher_eps).fit(X)
+    D_sparse = nn.radius_neighbors_graph(X, mode="distance")
+    dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric="precomputed")
+
+    assert_array_equal(dbscan_lower[0], dbscan_higher[0])
+    assert_array_equal(dbscan_lower[1], dbscan_higher[1])
+
+
+@pytest.mark.parametrize("metric", ["precomputed", "minkowski"])
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
+def test_dbscan_input_not_modified(metric, csr_container):
+    # test that the input is not modified by dbscan
+    X = np.random.RandomState(0).rand(10, 10)
+    X = csr_container(X) if csr_container is not None else X
+    X_copy = X.copy()
+    dbscan(X, metric=metric)
+
+    if csr_container is not None:
+        assert_array_equal(X.toarray(), X_copy.toarray())
+    else:
+        assert_array_equal(X, X_copy)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container):
+    """Check that we don't modify in-place the pre-computed sparse matrix.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/27508
+    """
+    X = np.random.RandomState(0).rand(10, 10)
+    # Add zeros on the diagonal that will be implicit when creating
+    # the sparse matrix. If `X` is modified in-place, the zeros from
+    # the diagonal will be made explicit.
+    np.fill_diagonal(X, 0)
+    X = csr_container(X)
+    assert all(row != col for row, col in zip(*X.nonzero()))
+    X_copy = X.copy()
+    dbscan(X, metric="precomputed")
+    # Make sure that we did not modify `X` in-place even by creating
+    # explicit 0s values.
+    assert X.nnz == X_copy.nnz
+    assert_array_equal(X.toarray(), X_copy.toarray())
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_dbscan_no_core_samples(csr_container):
+    rng = np.random.RandomState(0)
+    X = rng.rand(40, 10)
+    X[X < 0.8] = 0
+
+    for X_ in [X, csr_container(X)]:
+        db = DBSCAN(min_samples=6).fit(X_)
+        assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
+        assert_array_equal(db.labels_, -1)
+        assert db.core_sample_indices_.shape == (0,)
+
+
+def test_dbscan_callable():
+    # Tests the DBSCAN algorithm with a callable metric.
+    # Parameters chosen specifically for this task.
+    # Different eps to other test, because distance is not normalised.
+    eps = 0.8
+    min_samples = 10
+    # metric is the function reference, not the string key.
+    metric = distance.euclidean
+    # Compute DBSCAN
+    # parameters chosen for task
+    core_samples, labels = dbscan(
+        X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree"
+    )
+
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_1 == n_clusters
+
+    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree")
+    labels = db.fit(X).labels_
+
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_2 == n_clusters
+
+
+def test_dbscan_metric_params():
+    # Tests that DBSCAN works with the metrics_params argument.
+    eps = 0.8
+    min_samples = 10
+    p = 1
+
+    # Compute DBSCAN with metric_params arg
+
+    # Record every warning raised during the fit: with `p=None` and `p`
+    # given only through `metric_params`, none is expected.
+    with warnings.catch_warnings(record=True) as warns:
+        db = DBSCAN(
+            metric="minkowski",
+            metric_params={"p": p},
+            eps=eps,
+            p=None,
+            min_samples=min_samples,
+            algorithm="ball_tree",
+        ).fit(X)
+    assert not warns, warns[0].message
+    # Reference result that the later variants are compared against.
+    core_sample_1, labels_1 = db.core_sample_indices_, db.labels_
+
+    # Test that sample labels are the same as passing Minkowski 'p' directly
+    db = DBSCAN(
+        metric="minkowski", eps=eps, min_samples=min_samples, algorithm="ball_tree", p=p
+    ).fit(X)
+    core_sample_2, labels_2 = db.core_sample_indices_, db.labels_
+
+    assert_array_equal(core_sample_1, core_sample_2)
+    assert_array_equal(labels_1, labels_2)
+
+    # Minkowski with p=1 should be equivalent to Manhattan distance
+    db = DBSCAN(
+        metric="manhattan", eps=eps, min_samples=min_samples, algorithm="ball_tree"
+    ).fit(X)
+    core_sample_3, labels_3 = db.core_sample_indices_, db.labels_
+
+    assert_array_equal(core_sample_1, core_sample_3)
+    assert_array_equal(labels_1, labels_3)
+
+    # Passing `p` both ways with different values must emit the
+    # SyntaxWarning matched below.
+    with pytest.warns(
+        SyntaxWarning,
+        match=(
+            "Parameter p is found in metric_params. "
+            "The corresponding parameter from __init__ "
+            "is ignored."
+        ),
+    ):
+        # Test that checks p is ignored in favor of metric_params={'p': <val>}
+        db = DBSCAN(
+            metric="minkowski",
+            metric_params={"p": p},
+            eps=eps,
+            p=p + 1,
+            min_samples=min_samples,
+            algorithm="ball_tree",
+        ).fit(X)
+        core_sample_4, labels_4 = db.core_sample_indices_, db.labels_
+
+    # The result must match the reference, i.e. the `p` from
+    # `metric_params` was the one used.
+    assert_array_equal(core_sample_1, core_sample_4)
+    assert_array_equal(labels_1, labels_4)
+
+
+def test_dbscan_balltree():
+    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
+    eps = 0.8
+    min_samples = 10
+
+    D = pairwise_distances(X)
+    core_samples, labels = dbscan(
+        D, metric="precomputed", eps=eps, min_samples=min_samples
+    )
+
+    # number of clusters, ignoring noise if present
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_1 == n_clusters
+
+    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
+    labels = db.fit(X).labels_
+
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_2 == n_clusters
+
+    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree")
+    labels = db.fit(X).labels_
+
+    n_clusters_3 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_3 == n_clusters
+
+    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree")
+    labels = db.fit(X).labels_
+
+    n_clusters_4 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_4 == n_clusters
+
+    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree")
+    labels = db.fit(X).labels_
+
+    n_clusters_5 = len(set(labels)) - int(-1 in labels)
+    assert n_clusters_5 == n_clusters
+
+
+def test_input_validation():
+    # DBSCAN.fit should accept a list of lists.
+    X = [[1.0, 2.0], [3.0, 4.0]]
+    DBSCAN().fit(X)  # must not raise exception
+
+
+def test_pickle():
+    obj = DBSCAN()
+    s = pickle.dumps(obj)
+    assert type(pickle.loads(s)) == obj.__class__
+
+
+def test_boundaries():
+    # ensure min_samples is inclusive of core point
+    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
+    assert 0 in core
+    # ensure eps is inclusive of circumference
+    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
+    assert 0 in core
+    core, _ = dbscan([[0], [1], [1]], eps=0.99, min_samples=2)
+    assert 0 not in core
+
+
+def test_weighted_dbscan(global_random_seed):
+    """Check how `sample_weight` is handled by `dbscan` and `DBSCAN`.
+
+    Covers validation of the weight vector's length, the effect of the
+    weights on which samples become core samples, the equivalence between
+    integer weights and repeated samples, and the agreement between the
+    function, the precomputed-metric path and the estimator API.
+    """
+    # ensure sample_weight is validated: a length that differs from the
+    # number of samples must raise
+    with pytest.raises(ValueError):
+        dbscan([[0], [1]], sample_weight=[2])
+    with pytest.raises(ValueError):
+        dbscan([[0], [1]], sample_weight=[2, 3, 4])
+
+    # ensure sample_weight has an effect
+    # (in these cases a sample is core only when its own weight reaches
+    # min_samples; presumably the two points are not neighbors under the
+    # default eps)
+    assert_array_equal([], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0])
+    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0])
+    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0])
+    assert_array_equal(
+        [0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0]
+    )
+
+    # points within eps of each other:
+    assert_array_equal(
+        [0, 1], dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0]
+    )
+    # and effect of non-positive and non-integer sample_weight:
+    assert_array_equal(
+        [], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0]
+    )
+    assert_array_equal(
+        [0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[0]
+    )
+    assert_array_equal(
+        [0, 1], dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0]
+    )
+    assert_array_equal(
+        [], dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[0]
+    )
+
+    # for non-negative sample_weight, cores should be identical to repetition
+    rng = np.random.RandomState(global_random_seed)
+    sample_weight = rng.randint(0, 5, X.shape[0])
+    core1, label1 = dbscan(X, sample_weight=sample_weight)
+    assert len(label1) == len(X)
+
+    # Repeat every sample `sample_weight[i]` times and compare boolean core
+    # masks, since the indices differ between the two datasets.
+    X_repeated = np.repeat(X, sample_weight, axis=0)
+    core_repeated, label_repeated = dbscan(X_repeated)
+    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
+    core_repeated_mask[core_repeated] = True
+    core_mask = np.zeros(X.shape[0], dtype=bool)
+    core_mask[core1] = True
+    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)
+
+    # sample_weight should work with precomputed distance matrix
+    D = pairwise_distances(X)
+    core3, label3 = dbscan(D, sample_weight=sample_weight, metric="precomputed")
+    assert_array_equal(core1, core3)
+    assert_array_equal(label1, label3)
+
+    # sample_weight should work with estimator
+    est = DBSCAN().fit(X, sample_weight=sample_weight)
+    core4 = est.core_sample_indices_
+    label4 = est.labels_
+    assert_array_equal(core1, core4)
+    assert_array_equal(label1, label4)
+
+    # ... and with `fit_predict`, whose return value must match `labels_`
+    est = DBSCAN()
+    label5 = est.fit_predict(X, sample_weight=sample_weight)
+    core5 = est.core_sample_indices_
+    assert_array_equal(core1, core5)
+    assert_array_equal(label1, label5)
+    assert_array_equal(label1, est.labels_)
+
+
+@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"])
+def test_dbscan_core_samples_toy(algorithm):
+    X = [[0], [2], [3], [4], [6], [8], [10]]
+    n_samples = len(X)
+
+    # Degenerate case: every sample is a core sample, either with its own
+    # cluster or including other close core samples.
+    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=1)
+    assert_array_equal(core_samples, np.arange(n_samples))
+    assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])
+
+    # With eps=1 and min_samples=2 only the 3 samples from the denser area
+    # are core samples. All other points are isolated and considered noise.
+    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=2)
+    assert_array_equal(core_samples, [1, 2, 3])
+    assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
+
+    # Only the sample in the middle of the dense area is core. Its two
+    # neighbors are edge samples. Remaining samples are noise.
+    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=3)
+    assert_array_equal(core_samples, [2])
+    assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
+
+    # It's no longer possible to extract core samples with eps=1:
+    # everything is noise.
+    core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=4)
+    assert_array_equal(core_samples, [])
+    assert_array_equal(labels, np.full(n_samples, -1.0))
+
+
+def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
+    # see https://github.com/scikit-learn/scikit-learn/issues/4641 for
+    # more details
+    X = np.eye(10)
+    labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_
+    assert len(set(labels)) == 1
+
+    X = np.zeros((10, 10))
+    labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_
+    assert len(set(labels)) == 1
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_dbscan_precomputed_metric_with_initial_rows_zero(csr_container):
+    # sample matrix with initial two row all zero
+    ar = np.array(
+        [
+            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
+            [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
+            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
+            [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0],
+        ]
+    )
+    matrix = csr_container(ar)
+    labels = DBSCAN(eps=0.2, metric="precomputed", min_samples=2).fit(matrix).labels_
+    assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1])
@@ -0,0 +1,81 @@
+"""
+Tests for sklearn.cluster._feature_agglomeration
+"""
+
+# Authors: Sergul Aydore 2017
+import warnings
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+from sklearn.cluster import FeatureAgglomeration
+from sklearn.datasets import make_blobs
+from sklearn.utils._testing import assert_array_almost_equal
+
+
+def test_feature_agglomeration():
+    n_clusters = 1
+    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)
+
+    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.mean)
+    agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median)
+    agglo_mean.fit(X)
+    agglo_median.fit(X)
+
+    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
+    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
+    assert np.size(agglo_mean.labels_) == X.shape[1]
+    assert np.size(agglo_median.labels_) == X.shape[1]
+
+    # Test transform
+    Xt_mean = agglo_mean.transform(X)
+    Xt_median = agglo_median.transform(X)
+    assert Xt_mean.shape[1] == n_clusters
+    assert Xt_median.shape[1] == n_clusters
+    assert Xt_mean == np.array([1 / 3.0])
+    assert Xt_median == np.array([0.0])
+
+    # Test inverse transform
+    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
+    X_full_median = agglo_median.inverse_transform(Xt_median)
+    assert np.unique(X_full_mean[0]).size == n_clusters
+    assert np.unique(X_full_median[0]).size == n_clusters
+
+    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
+    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
+
+
+def test_feature_agglomeration_feature_names_out():
+    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
+    X, _ = make_blobs(n_features=6, random_state=0)
+    agglo = FeatureAgglomeration(n_clusters=3)
+    agglo.fit(X)
+    n_clusters = agglo.n_clusters_
+
+    names_out = agglo.get_feature_names_out()
+    assert_array_equal(
+        [f"featureagglomeration{i}" for i in range(n_clusters)], names_out
+    )
+
+
+# TODO(1.7): remove this test
+def test_inverse_transform_Xt_deprecation():
+    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)
+
+    est = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean)
+    est.fit(X)
+    X = est.transform(X)
+
+    with pytest.raises(TypeError, match="Missing required positional argument"):
+        est.inverse_transform()
+
+    with pytest.raises(TypeError, match="Cannot use both X and Xt. Use X only."):
+        est.inverse_transform(X=X, Xt=X)
+
+    with warnings.catch_warnings(record=True):
+        warnings.simplefilter("error")
+        est.inverse_transform(X)
+
+    with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"):
+        est.inverse_transform(Xt=X)
@@ -0,0 +1,602 @@
+"""
+Tests for HDBSCAN clustering algorithm
+Based on the DBSCAN test code
+"""
+
+import numpy as np
+import pytest
+from scipy import stats
+from scipy.spatial import distance
+
+from sklearn.cluster import HDBSCAN
+from sklearn.cluster._hdbscan._tree import (
+    CONDENSED_dtype,
+    _condense_tree,
+    _do_labelling,
+)
+from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
+from sklearn.datasets import make_blobs
+from sklearn.metrics import fowlkes_mallows_score
+from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances
+from sklearn.neighbors import BallTree, KDTree
+from sklearn.preprocessing import StandardScaler
+from sklearn.utils import shuffle
+from sklearn.utils._testing import assert_allclose, assert_array_equal
+from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
+
+# Shared data for the tests below: 200 samples from `make_blobs`, shuffled and
+# then standardized. `y` is the target returned by `make_blobs`; it serves as
+# the reference labelling in `check_label_quality`.
+X, y = make_blobs(n_samples=200, random_state=10)
+X, y = shuffle(X, y, random_state=7)
+X = StandardScaler().fit_transform(X)
+
+# Values of the `algorithm` parameter used to parametrize tests.
+ALGORITHMS = [
+    "kd_tree",
+    "ball_tree",
+    "brute",
+    "auto",
+]
+
+# All labels treated as "not a cluster": -1 plus every label listed in
+# `_OUTLIER_ENCODING`.
+OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()}
+
+
+def check_label_quality(labels, threshold=0.99):
+    n_clusters = len(set(labels) - OUTLIER_SET)
+    assert n_clusters == 3
+    assert fowlkes_mallows_score(labels, y) > threshold
+
+
+@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING)
+def test_outlier_data(outlier_type):
+    """
+    Tests if np.inf and np.nan data are each treated as special outliers.
+    """
+    outlier = {
+        "infinite": np.inf,
+        "missing": np.nan,
+    }[outlier_type]
+    prob_check = {
+        "infinite": lambda x, y: x == y,
+        "missing": lambda x, y: np.isnan(x),
+    }[outlier_type]
+    label = _OUTLIER_ENCODING[outlier_type]["label"]
+    prob = _OUTLIER_ENCODING[outlier_type]["prob"]
+
+    X_outlier = X.copy()
+    X_outlier[0] = [outlier, 1]
+    X_outlier[5] = [outlier, outlier]
+    model = HDBSCAN().fit(X_outlier)
+
+    (missing_labels_idx,) = (model.labels_ == label).nonzero()
+    assert_array_equal(missing_labels_idx, [0, 5])
+
+    (missing_probs_idx,) = (prob_check(model.probabilities_, prob)).nonzero()
+    assert_array_equal(missing_probs_idx, [0, 5])
+
+    clean_indices = list(range(1, 5)) + list(range(6, 200))
+    clean_model = HDBSCAN().fit(X_outlier[clean_indices])
+    assert_array_equal(clean_model.labels_, model.labels_[clean_indices])
+
+
+def test_hdbscan_distance_matrix():
+    """
+    Tests that HDBSCAN works with precomputed distance matrices, and throws the
+    appropriate errors when needed.
+    """
+    D = euclidean_distances(X)
+    D_original = D.copy()
+    labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D)
+
+    assert_allclose(D, D_original)
+    check_label_quality(labels)
+
+    msg = r"The precomputed distance matrix.*has shape"
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="precomputed", copy=True).fit_predict(X)
+
+    msg = r"The precomputed distance matrix.*values"
+    # Ensure the matrix is not symmetric
+    D[0, 1] = 10
+    D[1, 0] = 1
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="precomputed").fit_predict(D)
+
+
+@pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS])
+def test_hdbscan_sparse_distance_matrix(sparse_constructor):
+    """
+    Tests that HDBSCAN works with sparse distance matrices.
+    """
+    D = distance.squareform(distance.pdist(X))
+    D /= np.max(D)
+
+    threshold = stats.scoreatpercentile(D.flatten(), 50)
+
+    D[D >= threshold] = 0.0
+    D = sparse_constructor(D)
+    D.eliminate_zeros()
+
+    labels = HDBSCAN(metric="precomputed").fit_predict(D)
+    check_label_quality(labels)
+
+
+def test_hdbscan_feature_array():
+    """
+    Tests that HDBSCAN works with feature array, including an arbitrary
+    goodness of fit check. Note that the check is a simple heuristic.
+    """
+    labels = HDBSCAN().fit_predict(X)
+
+    # Check that clustering is arbitrarily good
+    # This is a heuristic to guard against regression
+    check_label_quality(labels)
+
+
+@pytest.mark.parametrize("algo", ALGORITHMS)
+@pytest.mark.parametrize("metric", _VALID_METRICS)
+def test_hdbscan_algorithms(algo, metric):
+    """
+    Tests that HDBSCAN works with the expected combinations of algorithms and
+    metrics, or raises the expected errors.
+    """
+    labels = HDBSCAN(algorithm=algo).fit_predict(X)
+    check_label_quality(labels)
+
+    # Validation for brute is handled by `pairwise_distances`
+    if algo in ("brute", "auto"):
+        return
+
+    ALGOS_TREES = {
+        "kd_tree": KDTree,
+        "ball_tree": BallTree,
+    }
+    metric_params = {
+        "mahalanobis": {"V": np.eye(X.shape[1])},
+        "seuclidean": {"V": np.ones(X.shape[1])},
+        "minkowski": {"p": 2},
+        "wminkowski": {"p": 2, "w": np.ones(X.shape[1])},
+    }.get(metric, None)
+
+    hdb = HDBSCAN(
+        algorithm=algo,
+        metric=metric,
+        metric_params=metric_params,
+    )
+
+    if metric not in ALGOS_TREES[algo].valid_metrics:
+        with pytest.raises(ValueError):
+            hdb.fit(X)
+    elif metric == "wminkowski":
+        with pytest.warns(FutureWarning):
+            hdb.fit(X)
+    else:
+        hdb.fit(X)
+
+
+def test_dbscan_clustering():
+    """
+    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
+    This test is more of a sanity check than a rigorous evaluation.
+    """
+    clusterer = HDBSCAN().fit(X)
+    labels = clusterer.dbscan_clustering(0.3)
+
+    # We use a looser threshold due to dbscan producing a more constrained
+    # clustering representation
+    check_label_quality(labels, threshold=0.92)
+
+
+@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
+def test_dbscan_clustering_outlier_data(cut_distance):
+    """
+    Tests if np.inf and np.nan data are each treated as special outliers.
+    """
+    missing_label = _OUTLIER_ENCODING["missing"]["label"]
+    infinite_label = _OUTLIER_ENCODING["infinite"]["label"]
+
+    X_outlier = X.copy()
+    X_outlier[0] = [np.inf, 1]
+    X_outlier[2] = [1, np.nan]
+    X_outlier[5] = [np.inf, np.nan]
+    model = HDBSCAN().fit(X_outlier)
+    labels = model.dbscan_clustering(cut_distance=cut_distance)
+
+    missing_labels_idx = np.flatnonzero(labels == missing_label)
+    assert_array_equal(missing_labels_idx, [2, 5])
+
+    infinite_labels_idx = np.flatnonzero(labels == infinite_label)
+    assert_array_equal(infinite_labels_idx, [0])
+
+    clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx))
+    clean_model = HDBSCAN().fit(X_outlier[clean_idx])
+    clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance)
+    assert_array_equal(clean_labels, labels[clean_idx])
+
+
+def test_hdbscan_best_balltree_metric():
+    """
+    Tests that HDBSCAN using `BallTree` works.
+    """
+    labels = HDBSCAN(
+        metric="seuclidean", metric_params={"V": np.ones(X.shape[1])}
+    ).fit_predict(X)
+    check_label_quality(labels)
+
+
+def test_hdbscan_no_clusters():
+    """
+    Tests that HDBSCAN correctly does not generate a valid cluster when the
+    `min_cluster_size` is too large for the data.
+    """
+    labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X)
+    assert set(labels).issubset(OUTLIER_SET)
+
+
+def test_hdbscan_min_cluster_size():
+    """
+    Test that the smallest non-noise cluster has at least `min_cluster_size`
+    many points
+    """
+    for min_cluster_size in range(2, len(X), 1):
+        labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X)
+        true_labels = [label for label in labels if label != -1]
+        if len(true_labels) != 0:
+            assert np.min(np.bincount(true_labels)) >= min_cluster_size
+
+
+def test_hdbscan_callable_metric():
+    """
+    Tests that HDBSCAN works when passed a callable metric.
+    """
+    metric = distance.euclidean
+    labels = HDBSCAN(metric=metric).fit_predict(X)
+    check_label_quality(labels)
+
+
+@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"])
+def test_hdbscan_precomputed_non_brute(tree):
+    """
+    Tests that HDBSCAN correctly raises an error when passing precomputed data
+    while requesting a tree-based algorithm.
+    """
+    hdb = HDBSCAN(metric="precomputed", algorithm=tree)
+    msg = "precomputed is not a valid metric for"
+    with pytest.raises(ValueError, match=msg):
+        hdb.fit(X)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_hdbscan_sparse(csr_container):
+    """
+    Tests that HDBSCAN works correctly when passing sparse feature data.
+    Evaluates correctness by comparing against the same data passed as a dense
+    array.
+    """
+
+    dense_labels = HDBSCAN().fit(X).labels_
+    check_label_quality(dense_labels)
+
+    _X_sparse = csr_container(X)
+    X_sparse = _X_sparse.copy()
+    sparse_labels = HDBSCAN().fit(X_sparse).labels_
+    assert_array_equal(dense_labels, sparse_labels)
+
+    # Compare that the sparse and dense non-precomputed routines return the same labels
+    # where the 0th observation contains the outlier.
+    for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")):
+        X_dense = X.copy()
+        X_dense[0, 0] = outlier_val
+        dense_labels = HDBSCAN().fit(X_dense).labels_
+        check_label_quality(dense_labels)
+        assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]
+
+        X_sparse = _X_sparse.copy()
+        X_sparse[0, 0] = outlier_val
+        sparse_labels = HDBSCAN().fit(X_sparse).labels_
+        assert_array_equal(dense_labels, sparse_labels)
+
+    msg = "Sparse data matrices only support algorithm `brute`."
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse)
+
+
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_hdbscan_centers(algorithm):
+    """
+    Tests that HDBSCAN centers are calculated and stored properly, and are
+    accurate to the data.
+    """
+    centers = [(0.0, 0.0), (3.0, 3.0)]
+    H, _ = make_blobs(n_samples=2000, random_state=0, centers=centers, cluster_std=0.5)
+    hdb = HDBSCAN(store_centers="both").fit(H)
+
+    for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_):
+        assert_allclose(center, centroid, rtol=1, atol=0.05)
+        assert_allclose(center, medoid, rtol=1, atol=0.05)
+
+    # Ensure that nothing is done for noise
+    hdb = HDBSCAN(
+        algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0]
+    ).fit(X)
+    assert hdb.centroids_.shape[0] == 0
+    assert hdb.medoids_.shape[0] == 0
+
+
+def test_hdbscan_allow_single_cluster_with_epsilon():
+    """
+    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
+    """
+    rng = np.random.RandomState(0)
+    no_structure = rng.rand(150, 2)
+    # without epsilon we should see many noise points as children of root.
+    labels = HDBSCAN(
+        min_cluster_size=5,
+        cluster_selection_epsilon=0.0,
+        cluster_selection_method="eom",
+        allow_single_cluster=True,
+    ).fit_predict(no_structure)
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    assert len(unique_labels) == 2
+
+    # Arbitrary heuristic. Would prefer something more precise.
+    assert counts[unique_labels == -1] > 30
+
+    # for this random seed an epsilon of 0.18 will produce exactly 2 noise
+    # points at that cut in single linkage.
+    labels = HDBSCAN(
+        min_cluster_size=5,
+        cluster_selection_epsilon=0.18,
+        cluster_selection_method="eom",
+        allow_single_cluster=True,
+        algorithm="kd_tree",
+    ).fit_predict(no_structure)
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    assert len(unique_labels) == 2
+    assert counts[unique_labels == -1] == 2
+
+
+def test_hdbscan_better_than_dbscan():
+    """
+    Validate that HDBSCAN can properly cluster this difficult synthetic
+    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
+    example)
+    """
+    # Four blobs with different spreads (see `cluster_std`)
+    centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]]
+    X, y = make_blobs(
+        n_samples=750,
+        centers=centers,
+        cluster_std=[0.2, 0.35, 1.35, 1.35],
+        random_state=0,
+    )
+    labels = HDBSCAN().fit(X).labels_
+
+    # number of clusters, not counting the -1 label if present
+    n_clusters = len(set(labels)) - int(-1 in labels)
+    assert n_clusters == 4
+    # NOTE(review): the result of this comparison is discarded (there is no
+    # `assert`), so the score is not actually checked. TODO confirm that the
+    # 0.99 threshold holds before turning this into an assertion.
+    fowlkes_mallows_score(labels, y) > 0.99
+
+
+@pytest.mark.parametrize(
+    "kwargs, X",
+    [
+        ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])),
+        ({"metric": "precomputed"}, [[1, 2], [2, 1]]),
+        ({}, [[1, 2], [3, 4]]),
+    ],
+)
+def test_hdbscan_usable_inputs(X, kwargs):
+    """
+    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
+    with non-finite points.
+    """
+    HDBSCAN(min_samples=1, **kwargs).fit(X)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_hdbscan_sparse_distances_too_few_nonzero(csr_container):
+    """
+    Tests that HDBSCAN raises the correct error when there are too few
+    non-zero distances.
+    """
+    X = csr_container(np.zeros((10, 10)))
+
+    msg = "There exists points with fewer than"
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="precomputed").fit(X)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_hdbscan_sparse_distances_disconnected_graph(csr_container):
+    """
+    Tests that HDBSCAN raises the correct error when the distance matrix
+    has multiple connected components.
+    """
+    # Create symmetric sparse matrix with 2 connected components
+    X = np.zeros((20, 20))
+    X[:5, :5] = 1
+    X[5:, 15:] = 1
+    X = X + X.T
+    X = csr_container(X)
+    msg = "HDBSCAN cannot be perfomed on a disconnected graph"
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(metric="precomputed").fit(X)
+
+
+def test_hdbscan_tree_invalid_metric():
+    """
+    Tests that HDBSCAN correctly raises an error for invalid metric choices.
+    """
+    metric_callable = lambda x: x
+    msg = (
+        ".* is not a valid metric for a .*-based algorithm\\. Please select a different"
+        " metric\\."
+    )
+
+    # Callables are not supported for either
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X)
+    with pytest.raises(ValueError, match=msg):
+        HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X)
+
+    # The set of valid metrics for KDTree at the time of writing this test is a
+    # strict subset of those supported in BallTree
+    metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics))
+    if len(metrics_not_kd) > 0:
+        with pytest.raises(ValueError, match=msg):
+            HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X)
+
+
+def test_hdbscan_too_many_min_samples():
+    """
+    Tests that HDBSCAN correctly raises an error when setting `min_samples`
+    larger than the number of samples.
+    """
+    hdb = HDBSCAN(min_samples=len(X) + 1)
+    msg = r"min_samples (.*) must be at most"
+    with pytest.raises(ValueError, match=msg):
+        hdb.fit(X)
+
+
+def test_hdbscan_precomputed_dense_nan():
+    """
+    Tests that HDBSCAN correctly raises an error when providing precomputed
+    distances with `np.nan` values.
+    """
+    X_nan = X.copy()
+    X_nan[0, 0] = np.nan
+    msg = "np.nan values found in precomputed-dense"
+    hdb = HDBSCAN(metric="precomputed")
+    with pytest.raises(ValueError, match=msg):
+        hdb.fit(X_nan)
+
+
+@pytest.mark.parametrize("allow_single_cluster", [True, False])
+@pytest.mark.parametrize("epsilon", [0, 0.1])
+def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
+    """
+    Tests that the `_do_labelling` helper function correctly assigns labels.
+
+    Three blobs are clustered, the condensed tree is rebuilt from the fitted
+    single-linkage tree, and the labels produced by `_do_labelling` must
+    match the true blob assignment up to a renaming of the labels.
+    """
+    n_samples = 48
+    X, y = make_blobs(
+        n_samples,
+        random_state=global_random_seed,
+        # Ensure the clusters are distinct with no overlap
+        centers=[
+            [0, 0],
+            [10, 0],
+            [0, 10],
+        ],
+    )
+
+    est = HDBSCAN().fit(X)
+    condensed_tree = _condense_tree(
+        est._single_linkage_tree_, min_cluster_size=est.min_cluster_size
+    )
+    # NOTE(review): the selected node ids are hard-coded; presumably they are
+    # the ids the condensed tree gives to the three blobs - TODO confirm.
+    clusters = {n_samples + 2, n_samples + 3, n_samples + 4}
+    cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2}
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters=clusters,
+        cluster_label_map=cluster_label_map,
+        allow_single_cluster=allow_single_cluster,
+        cluster_selection_epsilon=epsilon,
+    )
+
+    # Align the two labellings: map every true class to the label given to
+    # the first sample of that class, then require that all samples agree.
+    first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))}
+    y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))}
+    aligned_target = np.vectorize(y_to_labels.get)(y)
+    assert_array_equal(labels, aligned_target)
+
+
+def test_labelling_thresholding():
+    """
+    Tests that the `_do_labelling` helper function correctly thresholds the
+    incoming lambda values given various `cluster_selection_epsilon` values.
+    """
+    n_samples = 5
+    MAX_LAMBDA = 1.5
+    condensed_tree = np.array(
+        [
+            (5, 2, MAX_LAMBDA, 1),
+            (5, 1, 0.1, 1),
+            (5, 0, MAX_LAMBDA, 1),
+            (5, 3, 0.2, 1),
+            (5, 4, 0.3, 1),
+        ],
+        dtype=CONDENSED_dtype,
+    )
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters={n_samples},
+        cluster_label_map={n_samples: 0, n_samples + 1: 1},
+        allow_single_cluster=True,
+        cluster_selection_epsilon=1,
+    )
+    num_noise = condensed_tree["value"] < 1
+    assert sum(num_noise) == sum(labels == -1)
+
+    labels = _do_labelling(
+        condensed_tree=condensed_tree,
+        clusters={n_samples},
+        cluster_label_map={n_samples: 0, n_samples + 1: 1},
+        allow_single_cluster=True,
+        cluster_selection_epsilon=0,
+    )
+    # The threshold should be calculated per-sample based on the largest
+    # lambda of any simbling node. In this case, all points are siblings
+    # and the largest value is exactly MAX_LAMBDA.
+    num_noise = condensed_tree["value"] < MAX_LAMBDA
+    assert sum(num_noise) == sum(labels == -1)
+
+
+# TODO(1.6): Remove
+def test_hdbscan_warning_on_deprecated_algorithm_name():
+    # Test that warning message is shown when algorithm='kdtree'
+    msg = (
+        "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed"
+        " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`."
+    )
+    with pytest.warns(FutureWarning, match=msg):
+        HDBSCAN(algorithm="kdtree").fit(X)
+
+    # Test that warning message is shown when algorithm='balltree'
+    msg = (
+        "`algorithm='balltree'`has been deprecated in 1.4 and will be renamed"
+        " to'ball_tree'`in 1.6. To keep the past behaviour, set"
+        " `algorithm='ball_tree'`."
+    )
+    with pytest.warns(FutureWarning, match=msg):
+        HDBSCAN(algorithm="balltree").fit(X)
+
+
+@pytest.mark.parametrize("store_centers", ["centroid", "medoid"])
+def test_hdbscan_error_precomputed_and_store_centers(store_centers):
+    """Check that we raise an error if the centers are requested together with
+    a precomputed input matrix.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/27893
+    """
+    rng = np.random.RandomState(0)
+    X = rng.random((100, 2))
+    X_dist = euclidean_distances(X)
+    err_msg = "Cannot store centers when using a precomputed distance matrix."
+    with pytest.raises(ValueError, match=err_msg):
+        HDBSCAN(metric="precomputed", store_centers=store_centers).fit(X_dist)
+
+
+@pytest.mark.parametrize("valid_algo", ["auto", "brute"])
+def test_hdbscan_cosine_metric_valid_algorithm(valid_algo):
+    """Test that HDBSCAN works with the "cosine" metric when the algorithm is set
+    to "brute" or "auto".
+
+    Non-regression test for issue #28631
+    """
+    HDBSCAN(metric="cosine", algorithm=valid_algo).fit_predict(X)
+
+
+@pytest.mark.parametrize("invalid_algo", ["kd_tree", "ball_tree"])
+def test_hdbscan_cosine_metric_invalid_algorithm(invalid_algo):
+    """Test that HDBSCAN raises an informative error is raised when an unsupported
+    algorithm is used with the "cosine" metric.
+    """
+    hdbscan = HDBSCAN(metric="cosine", algorithm=invalid_algo)
+    with pytest.raises(ValueError, match="cosine is not a valid metric"):
+        hdbscan.fit_predict(X)
@@ -0,0 +1,900 @@
+"""
+Several basic tests for hierarchical clustering procedures
+
+"""
+
+# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
+#          Matteo Visconti di Oleggio Castello 2014
+# License: BSD 3 clause
+import itertools
+import shutil
+from functools import partial
+from tempfile import mkdtemp
+
+import numpy as np
+import pytest
+from scipy.cluster import hierarchy
+from scipy.sparse.csgraph import connected_components
+
+from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree
+from sklearn.cluster._agglomerative import (
+    _TREE_BUILDERS,
+    _fix_connectivity,
+    _hc_cut,
+    linkage_tree,
+)
+from sklearn.cluster._hierarchical_fast import (
+    average_merge,
+    max_merge,
+    mst_linkage_core,
+)
+from sklearn.datasets import make_circles, make_moons
+from sklearn.feature_extraction.image import grid_to_graph
+from sklearn.metrics import DistanceMetric
+from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score
+from sklearn.metrics.pairwise import (
+    PAIRED_DISTANCES,
+    cosine_distances,
+    manhattan_distances,
+    pairwise_distances,
+)
+from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
+from sklearn.neighbors import kneighbors_graph
+from sklearn.utils._fast_dict import IntFloatDict
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+    create_memmap_backed_data,
+    ignore_warnings,
+)
+from sklearn.utils.fixes import LIL_CONTAINERS
+
+
+def test_linkage_misc():
+    # Misc tests on linkage
+    rng = np.random.RandomState(42)
+    X = rng.normal(size=(5, 5))
+
+    with pytest.raises(ValueError):
+        linkage_tree(X, linkage="foo")
+
+    with pytest.raises(ValueError):
+        linkage_tree(X, connectivity=np.ones((4, 4)))
+
+    # Smoke test FeatureAgglomeration
+    FeatureAgglomeration().fit(X)
+
+    # test hierarchical clustering on a precomputed distances matrix
+    dis = cosine_distances(X)
+
+    res = linkage_tree(dis, affinity="precomputed")
+    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
+
+    # test hierarchical clustering on a precomputed distances matrix
+    res = linkage_tree(X, affinity=manhattan_distances)
+    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
+
+
+def test_structured_linkage_tree():
+    # Check that we obtain the correct solution for structured linkage trees.
+    rng = np.random.RandomState(0)
+    mask = np.ones([10, 10], dtype=bool)
+    # Avoiding a mask with only 'True' entries
+    mask[4:7, 4:7] = 0
+    X = rng.randn(50, 100)
+    connectivity = grid_to_graph(*mask.shape)
+    for tree_builder in _TREE_BUILDERS.values():
+        children, n_components, n_leaves, parent = tree_builder(
+            X.T, connectivity=connectivity
+        )
+        n_nodes = 2 * X.shape[1] - 1
+        assert len(children) + n_leaves == n_nodes
+        # Check that ward_tree raises a ValueError with a connectivity matrix
+        # of the wrong shape
+        with pytest.raises(ValueError):
+            tree_builder(X.T, connectivity=np.ones((4, 4)))
+        # Check that fitting with no samples raises an error
+        with pytest.raises(ValueError):
+            tree_builder(X.T[:0], connectivity=connectivity)
+
+
+def test_unstructured_linkage_tree():
+    # Check that we obtain the correct solution for unstructured linkage trees.
+    rng = np.random.RandomState(0)
+    X = rng.randn(50, 100)
+    for this_X in (X, X[0]):
+        # With specified a number of clusters just for the sake of
+        # raising a warning and testing the warning code
+        with ignore_warnings():
+            with pytest.warns(UserWarning):
+                children, n_nodes, n_leaves, parent = ward_tree(this_X.T, n_clusters=10)
+        n_nodes = 2 * X.shape[1] - 1
+        assert len(children) + n_leaves == n_nodes
+
+    for tree_builder in _TREE_BUILDERS.values():
+        for this_X in (X, X[0]):
+            with ignore_warnings():
+                with pytest.warns(UserWarning):
+                    children, n_nodes, n_leaves, parent = tree_builder(
+                        this_X.T, n_clusters=10
+                    )
+            n_nodes = 2 * X.shape[1] - 1
+            assert len(children) + n_leaves == n_nodes
+
+
+def test_height_linkage_tree():
+    # Check that the height of the results of linkage tree is sorted.
+    rng = np.random.RandomState(0)
+    mask = np.ones([10, 10], dtype=bool)
+    X = rng.randn(50, 100)
+    connectivity = grid_to_graph(*mask.shape)
+    for linkage_func in _TREE_BUILDERS.values():
+        children, n_nodes, n_leaves, parent = linkage_func(
+            X.T, connectivity=connectivity
+        )
+        n_nodes = 2 * X.shape[1] - 1
+        assert len(children) + n_leaves == n_nodes
+
+
+def test_zero_cosine_linkage_tree():
+    # Check that zero vectors in X produce an error when
+    # 'cosine' affinity is used
+    X = np.array([[0, 1], [0, 0]])
+    msg = "Cosine affinity cannot be used when X contains zero vectors"
+    with pytest.raises(ValueError, match=msg):
+        linkage_tree(X, affinity="cosine")
+
+
+@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
+@pytest.mark.parametrize("compute_distances", [True, False])
+@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"])
+def test_agglomerative_clustering_distances(
+    n_clusters, compute_distances, distance_threshold, linkage
+):
+    # Check that when `compute_distances` is True or `distance_threshold` is
+    # given, the fitted model has an attribute `distances_`.
+    rng = np.random.RandomState(0)
+    mask = np.ones([10, 10], dtype=bool)
+    n_samples = 100
+    X = rng.randn(n_samples, 50)
+    connectivity = grid_to_graph(*mask.shape)
+
+    clustering = AgglomerativeClustering(
+        n_clusters=n_clusters,
+        connectivity=connectivity,
+        linkage=linkage,
+        distance_threshold=distance_threshold,
+        compute_distances=compute_distances,
+    )
+    clustering.fit(X)
+    if compute_distances or (distance_threshold is not None):
+        assert hasattr(clustering, "distances_")
+        n_children = clustering.children_.shape[0]
+        n_nodes = n_children + 1
+        assert clustering.distances_.shape == (n_nodes - 1,)
+    else:
+        assert not hasattr(clustering, "distances_")
+
+
+@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
+def test_agglomerative_clustering(global_random_seed, lil_container):
+    # Check that we obtain the correct number of clusters with
+    # agglomerative clustering.
+    rng = np.random.RandomState(global_random_seed)
+    mask = np.ones([10, 10], dtype=bool)
+    n_samples = 100
+    X = rng.randn(n_samples, 50)
+    connectivity = grid_to_graph(*mask.shape)
+    for linkage in ("ward", "complete", "average", "single"):
+        clustering = AgglomerativeClustering(
+            n_clusters=10, connectivity=connectivity, linkage=linkage
+        )
+        clustering.fit(X)
+        # test caching
+        try:
+            tempdir = mkdtemp()
+            clustering = AgglomerativeClustering(
+                n_clusters=10,
+                connectivity=connectivity,
+                memory=tempdir,
+                linkage=linkage,
+            )
+            clustering.fit(X)
+            labels = clustering.labels_
+            assert np.size(np.unique(labels)) == 10
+        finally:
+            shutil.rmtree(tempdir)
+        # Turn caching off now
+        clustering = AgglomerativeClustering(
+            n_clusters=10, connectivity=connectivity, linkage=linkage
+        )
+        # Check that we obtain the same solution with early-stopping of the
+        # tree building
+        clustering.compute_full_tree = False
+        clustering.fit(X)
+        assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1)
+        clustering.connectivity = None
+        clustering.fit(X)
+        assert np.size(np.unique(clustering.labels_)) == 10
+        # Check that we raise a TypeError on dense matrices
+        clustering = AgglomerativeClustering(
+            n_clusters=10,
+            connectivity=lil_container(connectivity.toarray()[:10, :10]),
+            linkage=linkage,
+        )
+        with pytest.raises(ValueError):
+            clustering.fit(X)
+
+    # Test that using ward with another metric than euclidean raises an
+    # exception
+    clustering = AgglomerativeClustering(
+        n_clusters=10,
+        connectivity=connectivity.toarray(),
+        metric="manhattan",
+        linkage="ward",
+    )
+    with pytest.raises(ValueError):
+        clustering.fit(X)
+
+    # Test using another metric than euclidean works with linkage complete
+    for metric in PAIRED_DISTANCES.keys():
+        # Compare our (structured) implementation to scipy
+        clustering = AgglomerativeClustering(
+            n_clusters=10,
+            connectivity=np.ones((n_samples, n_samples)),
+            metric=metric,
+            linkage="complete",
+        )
+        clustering.fit(X)
+        clustering2 = AgglomerativeClustering(
+            n_clusters=10, connectivity=None, metric=metric, linkage="complete"
+        )
+        clustering2.fit(X)
+        assert_almost_equal(
+            normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1
+        )
+
+    # Test that using a distance matrix (affinity = 'precomputed') has same
+    # results (with connectivity constraints)
+    clustering = AgglomerativeClustering(
+        n_clusters=10, connectivity=connectivity, linkage="complete"
+    )
+    clustering.fit(X)
+    X_dist = pairwise_distances(X)
+    clustering2 = AgglomerativeClustering(
+        n_clusters=10,
+        connectivity=connectivity,
+        metric="precomputed",
+        linkage="complete",
+    )
+    clustering2.fit(X_dist)
+    assert_array_equal(clustering.labels_, clustering2.labels_)
+
+
+def test_agglomerative_clustering_memory_mapped():
+    """AgglomerativeClustering must work on mem-mapped dataset.
+
+    Non-regression test for issue #19875.
+    """
+    rng = np.random.RandomState(0)
+    Xmm = create_memmap_backed_data(rng.randn(50, 100))
+    AgglomerativeClustering(metric="euclidean", linkage="single").fit(Xmm)
+
+
+def test_ward_agglomeration(global_random_seed):
+    # Check that we obtain the correct solution in a simplistic case
+    rng = np.random.RandomState(global_random_seed)
+    mask = np.ones([10, 10], dtype=bool)
+    X = rng.randn(50, 100)
+    connectivity = grid_to_graph(*mask.shape)
+    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
+    agglo.fit(X)
+    assert np.size(np.unique(agglo.labels_)) == 5
+
+    X_red = agglo.transform(X)
+    assert X_red.shape[1] == 5
+    X_full = agglo.inverse_transform(X_red)
+    assert np.unique(X_full[0]).size == 5
+    assert_array_almost_equal(agglo.transform(X_full), X_red)
+
+    # Check that fitting with no samples raises a ValueError
+    with pytest.raises(ValueError):
+        agglo.fit(X[:0])
+
+
+def test_single_linkage_clustering():
+    # Check that we get the correct result in two emblematic cases
+    moons, moon_labels = make_moons(noise=0.05, random_state=42)
+    clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
+    clustering.fit(moons)
+    assert_almost_equal(
+        normalized_mutual_info_score(clustering.labels_, moon_labels), 1
+    )
+
+    circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42)
+    clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
+    clustering.fit(circles)
+    assert_almost_equal(
+        normalized_mutual_info_score(clustering.labels_, circle_labels), 1
+    )
+
+
+def assess_same_labelling(cut1, cut2):
+    """Util for comparison with scipy"""
+    co_clust = []
+    for cut in [cut1, cut2]:
+        n = len(cut)
+        k = cut.max() + 1
+        ecut = np.zeros((n, k))
+        ecut[np.arange(n), cut] = 1
+        co_clust.append(np.dot(ecut, ecut.T))
+    assert (co_clust[0] == co_clust[1]).all()
+
+
+def test_sparse_scikit_vs_scipy(global_random_seed):
+    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
+    n, p, k = 10, 5, 3
+    rng = np.random.RandomState(global_random_seed)
+
+    # Not using a lil_matrix here, just to check that non sparse
+    # matrices are well handled
+    connectivity = np.ones((n, n))
+    for linkage in _TREE_BUILDERS.keys():
+        for i in range(5):
+            X = 0.1 * rng.normal(size=(n, p))
+            X -= 4.0 * np.arange(n)[:, np.newaxis]
+            X -= X.mean(axis=1)[:, np.newaxis]
+
+            out = hierarchy.linkage(X, method=linkage)
+
+            children_ = out[:, :2].astype(int, copy=False)
+            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](
+                X, connectivity=connectivity
+            )
+
+            # Sort the order of child nodes per row for consistency
+            children.sort(axis=1)
+            assert_array_equal(
+                children,
+                children_,
+                "linkage tree differs from scipy impl for linkage: " + linkage,
+            )
+
+            cut = _hc_cut(k, children, n_leaves)
+            cut_ = _hc_cut(k, children_, n_leaves)
+            assess_same_labelling(cut, cut_)
+
+    # Test error management in _hc_cut
+    with pytest.raises(ValueError):
+        _hc_cut(n_leaves + 1, children, n_leaves)
+
+
+# Make sure our custom mst_linkage_core gives
+# the same results as scipy's builtin
+def test_vector_scikit_single_vs_scipy_single(global_random_seed):
+    n_samples, n_features, n_clusters = 10, 5, 3
+    rng = np.random.RandomState(global_random_seed)
+    X = 0.1 * rng.normal(size=(n_samples, n_features))
+    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
+    X -= X.mean(axis=1)[:, np.newaxis]
+
+    out = hierarchy.linkage(X, method="single")
+    children_scipy = out[:, :2].astype(int)
+
+    children, _, n_leaves, _ = _TREE_BUILDERS["single"](X)
+
+    # Sort the order of child nodes per row for consistency
+    children.sort(axis=1)
+    assert_array_equal(
+        children,
+        children_scipy,
+        "linkage tree differs from scipy impl for single linkage.",
+    )
+
+    cut = _hc_cut(n_clusters, children, n_leaves)
+    cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
+    assess_same_labelling(cut, cut_scipy)
+
+
+@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
+def test_mst_linkage_core_memory_mapped(metric_param_grid):
+    """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.
+
+    Non-regression test for issue #19875.
+    """
+    rng = np.random.RandomState(seed=1)
+    X = rng.normal(size=(20, 4))
+    Xmm = create_memmap_backed_data(X)
+    metric, param_grid = metric_param_grid
+    keys = param_grid.keys()
+    for vals in itertools.product(*param_grid.values()):
+        kwargs = dict(zip(keys, vals))
+        distance_metric = DistanceMetric.get_metric(metric, **kwargs)
+        mst = mst_linkage_core(X, distance_metric)
+        mst_mm = mst_linkage_core(Xmm, distance_metric)
+        np.testing.assert_equal(mst, mst_mm)
+
+
+def test_identical_points():
+    # Ensure identical points are handled correctly when using mst with
+    # a sparse connectivity matrix
+    X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]])
+    true_labels = np.array([0, 0, 1, 1, 2, 2])
+    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
+    connectivity = 0.5 * (connectivity + connectivity.T)
+    connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean")
+
+    for linkage in ("single", "average", "average", "ward"):
+        clustering = AgglomerativeClustering(
+            n_clusters=3, linkage=linkage, connectivity=connectivity
+        )
+        clustering.fit(X)
+
+        assert_almost_equal(
+            normalized_mutual_info_score(clustering.labels_, true_labels), 1
+        )
+
+
+def test_connectivity_propagation():
+    # Check that connectivity in the ward tree is propagated correctly during
+    # merging.
+    X = np.array(
+        [
+            (0.014, 0.120),
+            (0.014, 0.099),
+            (0.014, 0.097),
+            (0.017, 0.153),
+            (0.017, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.153),
+            (0.018, 0.152),
+            (0.018, 0.149),
+            (0.018, 0.144),
+        ]
+    )
+    connectivity = kneighbors_graph(X, 10, include_self=False)
+    ward = AgglomerativeClustering(
+        n_clusters=4, connectivity=connectivity, linkage="ward"
+    )
+    # If changes are not propagated correctly, fit crashes with an
+    # IndexError
+    ward.fit(X)
+
+
+def test_ward_tree_children_order(global_random_seed):
+    # Check that children are ordered in the same way for both structured and
+    # unstructured versions of ward_tree.
+
+    # test on five random datasets
+    n, p = 10, 5
+    rng = np.random.RandomState(global_random_seed)
+
+    connectivity = np.ones((n, n))
+    for i in range(5):
+        X = 0.1 * rng.normal(size=(n, p))
+        X -= 4.0 * np.arange(n)[:, np.newaxis]
+        X -= X.mean(axis=1)[:, np.newaxis]
+
+        out_unstructured = ward_tree(X)
+        out_structured = ward_tree(X, connectivity=connectivity)
+
+        assert_array_equal(out_unstructured[0], out_structured[0])
+
+
+def test_ward_linkage_tree_return_distance(global_random_seed):
+    # Test return_distance option on linkage and ward trees
+
+    # test that return_distance when set true, gives same
+    # output on both structured and unstructured clustering.
+    n, p = 10, 5
+    rng = np.random.RandomState(global_random_seed)
+
+    connectivity = np.ones((n, n))
+    for i in range(5):
+        X = 0.1 * rng.normal(size=(n, p))
+        X -= 4.0 * np.arange(n)[:, np.newaxis]
+        X -= X.mean(axis=1)[:, np.newaxis]
+
+        out_unstructured = ward_tree(X, return_distance=True)
+        out_structured = ward_tree(X, connectivity=connectivity, return_distance=True)
+
+        # get children
+        children_unstructured = out_unstructured[0]
+        children_structured = out_structured[0]
+
+        # check if we got the same clusters
+        assert_array_equal(children_unstructured, children_structured)
+
+        # check if the distances are the same
+        dist_unstructured = out_unstructured[-1]
+        dist_structured = out_structured[-1]
+
+        assert_array_almost_equal(dist_unstructured, dist_structured)
+
+        for linkage in ["average", "complete", "single"]:
+            structured_items = linkage_tree(
+                X, connectivity=connectivity, linkage=linkage, return_distance=True
+            )[-1]
+            unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[
+                -1
+            ]
+            structured_dist = structured_items[-1]
+            unstructured_dist = unstructured_items[-1]
+            structured_children = structured_items[0]
+            unstructured_children = unstructured_items[0]
+            assert_array_almost_equal(structured_dist, unstructured_dist)
+            assert_array_almost_equal(structured_children, unstructured_children)
+
+    # test on the following dataset where we know the truth
+    # taken from scipy/cluster/tests/hierarchy_test_data.py
+    X = np.array(
+        [
+            [1.43054825, -7.5693489],
+            [6.95887839, 6.82293382],
+            [2.87137846, -9.68248579],
+            [7.87974764, -6.05485803],
+            [8.24018364, -6.09495602],
+            [7.39020262, 8.54004355],
+        ]
+    )
+    # truth
+    linkage_X_ward = np.array(
+        [
+            [3.0, 4.0, 0.36265956, 2.0],
+            [1.0, 5.0, 1.77045373, 2.0],
+            [0.0, 2.0, 2.55760419, 2.0],
+            [6.0, 8.0, 9.10208346, 4.0],
+            [7.0, 9.0, 24.7784379, 6.0],
+        ]
+    )
+
+    linkage_X_complete = np.array(
+        [
+            [3.0, 4.0, 0.36265956, 2.0],
+            [1.0, 5.0, 1.77045373, 2.0],
+            [0.0, 2.0, 2.55760419, 2.0],
+            [6.0, 8.0, 6.96742194, 4.0],
+            [7.0, 9.0, 18.77445997, 6.0],
+        ]
+    )
+
+    linkage_X_average = np.array(
+        [
+            [3.0, 4.0, 0.36265956, 2.0],
+            [1.0, 5.0, 1.77045373, 2.0],
+            [0.0, 2.0, 2.55760419, 2.0],
+            [6.0, 8.0, 6.55832839, 4.0],
+            [7.0, 9.0, 15.44089605, 6.0],
+        ]
+    )
+
+    n_samples, n_features = np.shape(X)
+    connectivity_X = np.ones((n_samples, n_samples))
+
+    out_X_unstructured = ward_tree(X, return_distance=True)
+    out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True)
+
+    # check that the labels are the same
+    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
+    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])
+
+    # check that the distances are correct
+    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
+    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])
+
+    linkage_options = ["complete", "average", "single"]
+    X_linkage_truth = [linkage_X_complete, linkage_X_average]
+    for linkage, X_truth in zip(linkage_options, X_linkage_truth):
+        out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage)
+        out_X_structured = linkage_tree(
+            X, connectivity=connectivity_X, linkage=linkage, return_distance=True
+        )
+
+        # check that the labels are the same
+        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
+        assert_array_equal(X_truth[:, :2], out_X_structured[0])
+
+        # check that the distances are correct
+        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
+        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
+
+
+def test_connectivity_fixing_non_lil():
+    # Check non regression of a bug if a non item assignable connectivity is
+    # provided with more than one component.
+    # create dummy data
+    x = np.array([[0, 0], [1, 1]])
+    # create a mask with several components to force connectivity fixing
+    m = np.array([[True, False], [False, True]])
+    c = grid_to_graph(n_x=2, n_y=2, mask=m)
+    w = AgglomerativeClustering(connectivity=c, linkage="ward")
+    with pytest.warns(UserWarning):
+        w.fit(x)
+
+
+def test_int_float_dict():
+    rng = np.random.RandomState(0)
+    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
+    values = rng.rand(len(keys))
+
+    d = IntFloatDict(keys, values)
+    for key, value in zip(keys, values):
+        assert d[key] == value
+
+    other_keys = np.arange(50, dtype=np.intp)[::2]
+    other_values = np.full(50, 0.5)[::2]
+    other = IntFloatDict(other_keys, other_values)
+    # Complete smoke test
+    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
+    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
+
+
+def test_connectivity_callable():
+    rng = np.random.RandomState(0)
+    X = rng.rand(20, 5)
+    connectivity = kneighbors_graph(X, 3, include_self=False)
+    aglc1 = AgglomerativeClustering(connectivity=connectivity)
+    aglc2 = AgglomerativeClustering(
+        connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False)
+    )
+    aglc1.fit(X)
+    aglc2.fit(X)
+    assert_array_equal(aglc1.labels_, aglc2.labels_)
+
+
+def test_connectivity_ignores_diagonal():
+    rng = np.random.RandomState(0)
+    X = rng.rand(20, 5)
+    connectivity = kneighbors_graph(X, 3, include_self=False)
+    connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
+    aglc1 = AgglomerativeClustering(connectivity=connectivity)
+    aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
+    aglc1.fit(X)
+    aglc2.fit(X)
+    assert_array_equal(aglc1.labels_, aglc2.labels_)
+
+
+def test_compute_full_tree():
+    # Test that the full tree is computed if n_clusters is small
+    rng = np.random.RandomState(0)
+    X = rng.randn(10, 2)
+    connectivity = kneighbors_graph(X, 5, include_self=False)
+
+    # When n_clusters is less, the full tree should be built
+    # that is the number of merges should be n_samples - 1
+    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
+    agc.fit(X)
+    n_samples = X.shape[0]
+    n_nodes = agc.children_.shape[0]
+    assert n_nodes == n_samples - 1
+
+    # When n_clusters is large, greater than max of 100 and 0.02 * n_samples.
+    # we should stop when there are n_clusters.
+    n_clusters = 101
+    X = rng.randn(200, 2)
+    connectivity = kneighbors_graph(X, 10, include_self=False)
+    agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity)
+    agc.fit(X)
+    n_samples = X.shape[0]
+    n_nodes = agc.children_.shape[0]
+    assert n_nodes == n_samples - n_clusters
+
+
+def test_n_components():
+    # Test n_components returned by linkage, average and ward tree
+    rng = np.random.RandomState(0)
+    X = rng.rand(5, 5)
+
+    # Connectivity matrix having five components.
+    connectivity = np.eye(5)
+
+    for linkage_func in _TREE_BUILDERS.values():
+        assert ignore_warnings(linkage_func)(X, connectivity=connectivity)[1] == 5
+
+
+def test_affinity_passed_to_fix_connectivity():
+    # Test that the affinity parameter is actually passed to the pairwise
+    # function
+
+    size = 2
+    rng = np.random.RandomState(0)
+    X = rng.randn(size, size)
+    mask = np.array([True, False, False, True])
+
+    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)
+
+    class FakeAffinity:
+        def __init__(self):
+            self.counter = 0
+
+        def increment(self, *args, **kwargs):
+            self.counter += 1
+            return self.counter
+
+    fa = FakeAffinity()
+
+    linkage_tree(X, connectivity=connectivity, affinity=fa.increment)
+
+    assert fa.counter == 3
+
+
+@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
+def test_agglomerative_clustering_with_distance_threshold(linkage, global_random_seed):
+    # Check that we obtain the correct number of clusters with
+    # agglomerative clustering with distance_threshold.
+    rng = np.random.RandomState(global_random_seed)
+    mask = np.ones([10, 10], dtype=bool)
+    n_samples = 100
+    X = rng.randn(n_samples, 50)
+    connectivity = grid_to_graph(*mask.shape)
+    # test when distance threshold is set to 10
+    distance_threshold = 10
+    for conn in [None, connectivity]:
+        clustering = AgglomerativeClustering(
+            n_clusters=None,
+            distance_threshold=distance_threshold,
+            connectivity=conn,
+            linkage=linkage,
+        )
+        clustering.fit(X)
+        clusters_produced = clustering.labels_
+        num_clusters_produced = len(np.unique(clustering.labels_))
+        # test if the clusters produced match the point in the linkage tree
+        # where the distance exceeds the threshold
+        tree_builder = _TREE_BUILDERS[linkage]
+        children, n_components, n_leaves, parent, distances = tree_builder(
+            X, connectivity=conn, n_clusters=None, return_distance=True
+        )
+        num_clusters_at_threshold = (
+            np.count_nonzero(distances >= distance_threshold) + 1
+        )
+        # test number of clusters produced
+        assert num_clusters_at_threshold == num_clusters_produced
+        # test clusters produced
+        clusters_at_threshold = _hc_cut(
+            n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves
+        )
+        assert np.array_equiv(clusters_produced, clusters_at_threshold)
+
+
+def test_small_distance_threshold(global_random_seed):
+    rng = np.random.RandomState(global_random_seed)
+    n_samples = 10
+    X = rng.randint(-300, 300, size=(n_samples, 3))
+    # this should result in all data in their own clusters, given that
+    # their pairwise distances are bigger than .1 (which may not be the case
+    # with a different random seed).
+    clustering = AgglomerativeClustering(
+        n_clusters=None, distance_threshold=1.0, linkage="single"
+    ).fit(X)
+    # check that the pairwise distances are indeed all larger than .1
+    all_distances = pairwise_distances(X, metric="minkowski", p=2)
+    np.fill_diagonal(all_distances, np.inf)
+    assert np.all(all_distances > 0.1)
+    assert clustering.n_clusters_ == n_samples
+
+
+def test_cluster_distances_with_distance_threshold(global_random_seed):
+    """Check single-linkage clusters obtained with a distance threshold.
+
+    Within a cluster, every sample must have another member closer than the
+    threshold; across clusters, no pair of samples may be closer than the
+    threshold.
+    """
+    rng = np.random.RandomState(global_random_seed)
+    n_samples = 100
+    X = rng.randint(-10, 10, size=(n_samples, 3))
+    # check the distances within the clusters and with other clusters
+    distance_threshold = 4
+    clustering = AgglomerativeClustering(
+        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
+    ).fit(X)
+    labels = clustering.labels_
+    D = pairwise_distances(X, metric="minkowski", p=2)
+    # to avoid taking the 0 diagonal in min()
+    np.fill_diagonal(D, np.inf)
+    for label in np.unique(labels):
+        in_cluster_mask = labels == label
+        # Largest, over the members of the cluster, of the distance to the
+        # nearest other member.
+        max_in_cluster_distance = (
+            D[in_cluster_mask][:, in_cluster_mask].min(axis=0).max()
+        )
+        # Smallest distance between a member of the cluster and any sample
+        # outside of it.
+        min_out_cluster_distance = (
+            D[in_cluster_mask][:, ~in_cluster_mask].min(axis=0).min()
+        )
+        # single data point clusters only have that inf diagonal here
+        if in_cluster_mask.sum() > 1:
+            assert max_in_cluster_distance < distance_threshold
+        assert min_out_cluster_distance >= distance_threshold
+
+
+@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
+@pytest.mark.parametrize(
+    ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]
+)
+def test_agglomerative_clustering_with_distance_threshold_edge_case(
+    linkage, threshold, y_true
+):
+    # test boundary case of distance_threshold matching the distance
+    X = [[0], [1]]
+    clusterer = AgglomerativeClustering(
+        n_clusters=None, distance_threshold=threshold, linkage=linkage
+    )
+    y_pred = clusterer.fit_predict(X)
+    assert adjusted_rand_score(y_true, y_pred) == 1
+
+
+def test_dist_threshold_invalid_parameters():
+    X = [[0], [1]]
+    with pytest.raises(ValueError, match="Exactly one of "):
+        AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X)
+
+    with pytest.raises(ValueError, match="Exactly one of "):
+        AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X)
+
+    X = [[0], [1]]
+    with pytest.raises(ValueError, match="compute_full_tree must be True if"):
+        AgglomerativeClustering(
+            n_clusters=None, distance_threshold=1, compute_full_tree=False
+        ).fit(X)
+
+
+def test_invalid_shape_precomputed_dist_matrix():
+    # Check that an error is raised when affinity='precomputed'
+    # and a non square matrix is passed (PR #16257).
+    rng = np.random.RandomState(0)
+    X = rng.rand(5, 3)
+    with pytest.raises(
+        ValueError,
+        match=r"Distance matrix should be square, got matrix of shape \(5, 3\)",
+    ):
+        AgglomerativeClustering(metric="precomputed", linkage="complete").fit(X)
+
+
+def test_precomputed_connectivity_metric_with_2_connected_components():
+    """Check that connecting components works when connectivity and
+    affinity are both precomputed and the number of connected components is
+    greater than 1. Non-regression test for #16151.
+    """
+
+    connectivity_matrix = np.array(
+        [
+            [0, 1, 1, 0, 0],
+            [0, 0, 1, 0, 0],
+            [0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 1],
+            [0, 0, 0, 0, 0],
+        ]
+    )
+    # ensure that connectivity_matrix has two connected components
+    assert connected_components(connectivity_matrix)[0] == 2
+
+    rng = np.random.RandomState(0)
+    X = rng.randn(5, 10)
+
+    X_dist = pairwise_distances(X)
+    clusterer_precomputed = AgglomerativeClustering(
+        metric="precomputed", connectivity=connectivity_matrix, linkage="complete"
+    )
+    msg = "Completing it to avoid stopping the tree early"
+    with pytest.warns(UserWarning, match=msg):
+        clusterer_precomputed.fit(X_dist)
+
+    clusterer = AgglomerativeClustering(
+        connectivity=connectivity_matrix, linkage="complete"
+    )
+    with pytest.warns(UserWarning, match=msg):
+        clusterer.fit(X)
+
+    assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_)
+    assert_array_equal(clusterer.children_, clusterer_precomputed.children_)
+
+
+# TODO(1.6): remove in 1.6
+@pytest.mark.parametrize(
+    "Agglomeration", [AgglomerativeClustering, FeatureAgglomeration]
+)
+def test_deprecation_warning_metric_None(Agglomeration):
+    X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
+    warn_msg = "`metric=None` is deprecated in version 1.4 and will be removed"
+    with pytest.warns(FutureWarning, match=warn_msg):
+        Agglomeration(metric=None).fit(X)
@@ -0,0 +1,215 @@
+"""
+Testing for mean shift clustering methods
+
+"""
+
+import warnings
+
+import numpy as np
+import pytest
+
+from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
+from sklearn.datasets import make_blobs
+from sklearn.metrics import v_measure_score
+from sklearn.utils._testing import assert_allclose, assert_array_equal
+
+# Module-level dataset used by several tests below: 300 two-dimensional
+# samples drawn by ``make_blobs`` around three centers, each center being
+# shifted by +10 on both coordinates. ``random_state=11`` fixes the draw.
+# ``n_clusters`` matches the number of rows of ``centers``.
+n_clusters = 3
+centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
+X, _ = make_blobs(
+    n_samples=300,
+    n_features=2,
+    centers=centers,
+    cluster_std=0.4,
+    shuffle=True,
+    random_state=11,
+)
+
+
+def test_convergence_of_1d_constant_data():
+    """MeanShift must stop before ``max_iter`` on constant 1D data.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/28926
+    """
+    constant_column = np.ones((10, 1))
+    estimator = MeanShift().fit(constant_column)
+    assert estimator.n_iter_ < estimator.max_iter
+
+
+def test_estimate_bandwidth():
+    """``estimate_bandwidth`` with ``n_samples=200`` must land in [0.9, 1.5]."""
+    lower, upper = 0.9, 1.5
+    estimated = estimate_bandwidth(X, n_samples=200)
+    assert lower <= estimated <= upper
+
+
+def test_estimate_bandwidth_1sample(global_dtype):
+    """Check ``estimate_bandwidth`` with ``n_samples=1`` and ``quantile < 1``.
+
+    In that configuration n_neighbors is set to 1.
+    """
+    X_cast = X.astype(global_dtype, copy=False)
+    bandwidth = estimate_bandwidth(X_cast, n_samples=1, quantile=0.3)
+
+    # The dtype is compared with the module-level ``X``, not with ``X_cast``.
+    assert bandwidth.dtype == X.dtype
+    assert bandwidth == pytest.approx(0.0, abs=1e-5)
+
+
+@pytest.mark.parametrize(
+    "bandwidth, cluster_all, expected, first_cluster_label",
+    [(1.2, True, 3, 0), (1.2, False, 4, -1)],
+)
+def test_mean_shift(
+    global_dtype, bandwidth, cluster_all, expected, first_cluster_label
+):
+    """Check the MeanShift estimator and the ``mean_shift`` function.
+
+    For both, the number of distinct labels, the smallest label and the dtype
+    of the cluster centers are verified.
+    """
+    data = X.astype(global_dtype, copy=False)
+
+    def check(found_centers, found_labels):
+        distinct_labels = np.unique(found_labels)
+        assert len(distinct_labels) == expected
+        assert distinct_labels[0] == first_cluster_label
+        assert found_centers.dtype == global_dtype
+
+    estimator = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
+    estimator_labels = estimator.fit(data).labels_
+    check(estimator.cluster_centers_, estimator_labels)
+
+    # The function is called without an explicit bandwidth.
+    function_centers, function_labels = mean_shift(data, cluster_all=cluster_all)
+    check(function_centers, function_labels)
+
+
+def test_parallel(global_dtype):
+    """``n_jobs=2`` must give the same centers and labels as the default."""
+    blob_centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
+    samples, _ = make_blobs(
+        n_samples=50,
+        n_features=2,
+        centers=blob_centers,
+        cluster_std=0.4,
+        shuffle=True,
+        random_state=11,
+    )
+    samples = samples.astype(global_dtype, copy=False)
+
+    with_two_jobs = MeanShift(n_jobs=2)
+    with_two_jobs.fit(samples)
+    with_default_jobs = MeanShift()
+    with_default_jobs.fit(samples)
+
+    centers_two = with_two_jobs.cluster_centers_
+    centers_default = with_default_jobs.cluster_centers_
+    assert_allclose(centers_two, centers_default)
+    assert centers_two.dtype == centers_default.dtype
+    assert_array_equal(with_two_jobs.labels_, with_default_jobs.labels_)
+
+
+def test_meanshift_predict(global_dtype):
+    """``predict`` on the training data must reproduce ``fit_predict``."""
+    data = X.astype(global_dtype, copy=False)
+    estimator = MeanShift(bandwidth=1.2)
+    labels_from_fit = estimator.fit_predict(data)
+    labels_from_predict = estimator.predict(data)
+    assert_array_equal(labels_from_fit, labels_from_predict)
+
+
+def test_meanshift_all_orphans():
+    """Seeds placed away from the data must make ``fit`` raise a ValueError."""
+    distant_seeds = [[-9, -9], [-10, -10]]
+    estimator = MeanShift(bandwidth=0.1, seeds=distant_seeds)
+    with pytest.raises(ValueError, match="No point was within bandwidth=0.1"):
+        estimator.fit(X)
+
+
+def test_unfitted():
+    """Non-regression: no fitted attribute may exist before ``fit`` is called."""
+    estimator = MeanShift()
+    for fitted_attribute in ("cluster_centers_", "labels_"):
+        assert not hasattr(estimator, fitted_attribute)
+
+
+def test_cluster_intensity_tie(global_dtype):
+    """Check the labels of two three-point groups given in both orders.
+
+    Whatever the order, the points of ``group_b`` are expected to get label 0
+    and the points of ``group_a`` label 1.
+    """
+    group_a = [[1, 1], [2, 1], [1, 0]]
+    group_b = [[4, 7], [3, 5], [3, 6]]
+
+    a_then_b = np.array(group_a + group_b, dtype=global_dtype)
+    labels_a_then_b = MeanShift(bandwidth=2).fit(a_then_b).labels_
+
+    b_then_a = np.array(group_b + group_a, dtype=global_dtype)
+    labels_b_then_a = MeanShift(bandwidth=2).fit(b_then_a).labels_
+
+    assert_array_equal(labels_a_then_b, [1, 1, 1, 0, 0, 0])
+    assert_array_equal(labels_b_then_a, [0, 0, 0, 1, 1, 1])
+
+
+def test_bin_seeds(global_dtype):
+    """Check ``get_bin_seeds``, the bin seeding usable by mean shift."""
+    # Six points in the plane.
+    points = np.array(
+        [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]],
+        dtype=global_dtype,
+    )
+
+    def seeds_as_set(bin_size, min_bin_freq):
+        return {tuple(seed) for seed in get_bin_seeds(points, bin_size, min_bin_freq)}
+
+    # Bin coarseness 1.0 and min_bin_freq 1: three bins are expected.
+    assert seeds_as_set(1, 1) == {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)}
+
+    # Bin coarseness 1.0 and min_bin_freq 2: two bins are expected.
+    assert seeds_as_set(1, 2) == {(1.0, 1.0), (2.0, 1.0)}
+
+    # Bin size 0.01 and min_bin_freq 1 would give six bins, one per point, so
+    # the whole data is used instead. Warnings raised here are captured.
+    with warnings.catch_warnings(record=True):
+        fine_bins = get_bin_seeds(points, 0.01, 1)
+    assert_allclose(fine_bins, points)
+
+    # Tight clusters around [0, 0] and [1, 1]: only two bins.
+    blobs, _ = make_blobs(
+        n_samples=100,
+        n_features=2,
+        centers=[[0, 0], [1, 1]],
+        cluster_std=0.1,
+        random_state=0,
+    )
+    blobs = blobs.astype(global_dtype, copy=False)
+    assert_array_equal(get_bin_seeds(blobs, 1), [[0, 0], [1, 1]])
+
+
+@pytest.mark.parametrize("max_iter", [1, 100])
+def test_max_iter(max_iter):
+    """The function and the estimator must agree for a given ``max_iter``."""
+    function_centers, _ = mean_shift(X, max_iter=max_iter)
+    estimator = MeanShift(max_iter=max_iter).fit(X)
+    estimator_centers = estimator.cluster_centers_
+
+    assert estimator.n_iter_ <= estimator.max_iter
+    assert len(function_centers) == len(estimator_centers)
+    assert all(
+        np.allclose(from_function, from_estimator)
+        for from_function, from_estimator in zip(function_centers, estimator_centers)
+    )
+
+
+def test_mean_shift_zero_bandwidth(global_dtype):
+    """Mean shift must keep working when the estimated bandwidth is 0."""
+    column = np.array([1, 1, 1, 2, 2, 2, 3, 3], dtype=global_dtype).reshape(-1, 1)
+
+    # With default arguments the estimated bandwidth is 0 on this dataset.
+    zero_bandwidth = estimate_bandwidth(column)
+    assert zero_bandwidth == 0
+
+    # A bin size of 0 must make get_bin_seeds return the very same object.
+    assert get_bin_seeds(column, bin_size=zero_bandwidth) is column
+
+    # Binning with a bandwidth estimated to 0 should be equivalent to no
+    # binning at all.
+    with_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(column)
+    without_binning = MeanShift(bin_seeding=False).fit(column)
+    truth = np.array([0, 0, 0, 1, 1, 1, 2, 2])
+
+    for fitted in (with_binning, without_binning):
+        assert v_measure_score(fitted.labels_, truth) == pytest.approx(1)
+    assert_allclose(with_binning.cluster_centers_, without_binning.cluster_centers_)
@@ -0,0 +1,858 @@
+# Authors: Shane Grigsby <refuge@rocktalus.com>
+#          Adrin Jalali <adrin.jalali@gmail.com>
+# License: BSD 3 clause
+import warnings
+
+import numpy as np
+import pytest
+
+from sklearn.cluster import DBSCAN, OPTICS
+from sklearn.cluster._optics import _extend_region, _extract_xi_labels
+from sklearn.cluster.tests.common import generate_clustered_data
+from sklearn.datasets import make_blobs
+from sklearn.exceptions import DataConversionWarning, EfficiencyWarning
+from sklearn.metrics.cluster import contingency_matrix
+from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.utils import shuffle
+from sklearn.utils._testing import assert_allclose, assert_array_equal
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# Module-level dataset used by several tests below: six 2-D clusters of
+# ``n_points_per_cluster`` points each, built as ``center + scale * noise``.
+# All noise comes from the same seeded RandomState, drawn in the order
+# C1..C6, so reordering these lines would change the data.
+rng = np.random.RandomState(0)
+n_points_per_cluster = 10
+C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2)
+C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2)
+C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
+C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2)
+C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
+C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
+# Stack the clusters row-wise in the order C1..C6 (60 rows, 2 columns).
+X = np.vstack((C1, C2, C3, C4, C5, C6))
+
+
+@pytest.mark.parametrize(
+    ("r_plot", "end"),
+    [
+        ([10, 8.9, 8.8, 8.7, 7, 10], 3),
+        ([10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0),
+        ([10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4),
+        # NOTE(review): identical to the previous case.
+        ([10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4),
+    ],
+)
+def test_extend_downward(r_plot, end):
+    """Check the index returned by ``_extend_region`` for downward regions."""
+    reachability = np.asarray(r_plot)
+    # Ratio of each value to its successor.
+    consecutive_ratio = reachability[:-1] / reachability[1:]
+    is_steep_downward = consecutive_ratio >= 1 / 0.9
+    is_upward = consecutive_ratio < 1
+
+    assert _extend_region(is_steep_downward, is_upward, 0, 2) == end
+
+
+@pytest.mark.parametrize(
+    ("r_plot", "end"),
+    [
+        ([1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6),
+        ([1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0),
+        ([1, 2, 2.1, 2, np.inf], 0),
+        ([1, 2, 2.1, np.inf], 2),
+    ],
+)
+def test_extend_upward(r_plot, end):
+    """Check the index returned by ``_extend_region`` for upward regions."""
+    reachability = np.asarray(r_plot)
+    # Ratio of each value to its successor.
+    consecutive_ratio = reachability[:-1] / reachability[1:]
+    is_steep_upward = consecutive_ratio <= 0.9
+    is_downward = consecutive_ratio > 1
+
+    assert _extend_region(is_steep_upward, is_downward, 0, 2) == end
+
+
+@pytest.mark.parametrize(
+    ("ordering", "clusters", "expected"),
+    [
+        ([0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]),
+        ([0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]),
+        ([0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]),
+        ([3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]),
+    ],
+)
+def test_the_extract_xi_labels(ordering, clusters, expected):
+    """``_extract_xi_labels`` must return the expected label of each sample."""
+    assert_array_equal(_extract_xi_labels(ordering, clusters), expected)
+
+
+def test_extract_xi(global_dtype):
+    """Check xi extraction on a small, easy dataset with clear noise points.
+
+    No cluster surrounds another cluster in these datasets.
+    """
+    rng = np.random.RandomState(0)
+    n_points_per_cluster = 5
+
+    # (center, spread) of each blob. The blobs are drawn in this order from
+    # ``rng``; the shuffles below then consume the same stream.
+    blob_specs = [
+        ([-5, -2], 0.8),
+        ([4, -1], 0.1),
+        ([1, -2], 0.2),
+        ([-2, 3], 0.3),
+        ([3, -2], 0.6),
+        ([5, 6], 0.2),
+    ]
+    C1, C2, C3, C4, C5, C6 = [
+        center + spread * rng.randn(n_points_per_cluster, 2)
+        for center, spread in blob_specs
+    ]
+    outlier = np.array([[100, 100]])
+
+    # One outlier between the fifth and the sixth blob.
+    X = np.vstack((C1, C2, C3, C4, C5, outlier, C6))
+    X = X.astype(global_dtype, copy=False)
+    expected_labels = np.repeat([2, 0, 1, 3, 1, -1, 4], [5, 5, 5, 5, 5, 1, 5])
+    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+
+    shared = dict(max_eps=20, cluster_method="xi", xi=0.4)
+    clust = OPTICS(min_samples=3, min_cluster_size=2, **shared).fit(X)
+    assert_array_equal(clust.labels_, expected_labels)
+
+    # Same check with float min_samples and min_cluster_size.
+    clust = OPTICS(min_samples=0.1, min_cluster_size=0.08, **shared).fit(X)
+    assert_array_equal(clust.labels_, expected_labels)
+
+    # Two identical outliers this time.
+    X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
+    X = X.astype(global_dtype, copy=False)
+    expected_labels = np.repeat([1, 3, 2, 0, 2, -1, 4], [5, 5, 5, 5, 5, 2, 5])
+    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+
+    clust = OPTICS(
+        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
+    ).fit(X)
+    # This may fail if the predecessor correction is not at work!
+    assert_array_equal(clust.labels_, expected_labels)
+
+    # Three hand-written groups of four points.
+    C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
+    C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
+    C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
+    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)
+    expected_labels = np.repeat([0, 1, 2], 4)
+    X, expected_labels = shuffle(X, expected_labels, random_state=rng)
+
+    clust = OPTICS(
+        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
+    ).fit(X)
+    assert_array_equal(clust.labels_, expected_labels)
+
+
+def test_cluster_hierarchy_(global_dtype):
+    """Check ``cluster_hierarchy_`` on a tight blob nested in a wide blob."""
+    rng = np.random.RandomState(0)
+    n_points_per_cluster = 100
+    # Both blobs are centered on the origin; only the scale differs. The tight
+    # blob is drawn first.
+    tight, wide = [
+        [0, 0] + scale * rng.randn(n_points_per_cluster, 2).astype(
+            global_dtype, copy=False
+        )
+        for scale in (2, 50)
+    ]
+    data = shuffle(np.vstack((tight, wide)), random_state=0)
+
+    hierarchy = OPTICS(min_samples=20, xi=0.1).fit(data).cluster_hierarchy_
+    assert hierarchy.shape == (2, 2)
+
+    reference = np.array([[0, 99], [0, 199]])
+    # NOTE(review): this is a signed sum, so offsets of opposite sign cancel.
+    signed_offset = np.sum(hierarchy - reference)
+    assert signed_offset / len(data) < 0.05
+
+
+@pytest.mark.parametrize(
+    "csr_container, metric",
+    [
+        (None, "minkowski"),
+        *((container, "euclidean") for container in CSR_CONTAINERS),
+    ],
+)
+def test_correct_number_of_clusters(metric, csr_container):
+    """Check the number of clusters found and the fitted attributes."""
+    n_clusters = 3
+    data = generate_clustered_data(n_clusters=n_clusters)
+    fit_input = data if csr_container is None else csr_container(data)
+
+    # Parameters chosen specifically for this task.
+    model = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
+    model.fit(fit_input)
+
+    # Number of clusters, ignoring the noise label if present.
+    found_labels = set(model.labels_)
+    found_labels.discard(-1)
+    assert len(found_labels) == n_clusters
+
+    # Every fitted attribute is 1D with one entry per sample; integer-like
+    # ("i") or floating ("f") depending on the attribute.
+    n_samples = len(data)
+    expected_kinds = (
+        ("labels_", "i"),
+        ("reachability_", "f"),
+        ("core_distances_", "f"),
+        ("ordering_", "i"),
+    )
+    for attribute, kind in expected_kinds:
+        fitted = getattr(model, attribute)
+        assert fitted.shape == (n_samples,)
+        assert fitted.dtype.kind == kind
+
+    # The ordering contains each sample index exactly once.
+    assert set(model.ordering_) == set(range(n_samples))
+
+
+def test_minimum_number_of_sample_check():
+    """``fit`` must reject ``min_samples=10`` when only one sample is given."""
+    single_sample = [[1, 1]]
+    model = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0)
+
+    with pytest.raises(ValueError, match="min_samples must be no greater than"):
+        model.fit(single_sample)
+
+
+def test_bad_extract():
+    """An extraction ``eps`` larger than ``max_eps`` must raise a ValueError."""
+    blob_centers = [[1, 1], [-1, -1], [1, -1]]
+    samples, _ = make_blobs(
+        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
+    )
+
+    # max_eps is 5.0 * 0.03 = 0.15 while the extraction eps is 0.3.
+    model = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10)
+    with pytest.raises(
+        ValueError, match="Specify an epsilon smaller than 0.15. Got 0.3."
+    ):
+        model.fit(samples)
+
+
+def test_bad_reachability():
+    """A tiny ``max_eps`` must trigger the all-inf reachability warning."""
+    blob_centers = [[1, 1], [-1, -1], [1, -1]]
+    samples, _ = make_blobs(
+        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
+    )
+    expected_warning = "All reachability values are inf. Set a larger max_eps."
+
+    with pytest.warns(UserWarning, match=expected_warning):
+        OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015).fit(samples)
+
+
+def test_nowarn_if_metric_bool_data_bool():
+    """No DataConversionWarning when the metric and the data are both boolean.
+
+    Non-regression test for
+    https://github.com/scikit-learn/scikit-learn/issues/18996
+    """
+    pairwise_metric = "rogerstanimoto"
+    # Draw from a seeded generator rather than the global numpy RNG so that a
+    # failure can be reproduced with exactly the same data.
+    rng = np.random.RandomState(0)
+    X = rng.randint(2, size=(5, 2), dtype=bool)
+
+    with warnings.catch_warnings():
+        # Escalate the warning to an error so the test fails if it is emitted.
+        warnings.simplefilter("error", DataConversionWarning)
+
+        OPTICS(metric=pairwise_metric).fit(X)
+
+
+def test_warn_if_metric_bool_data_no_bool():
+    """A *single* conversion warning when the metric is boolean but the data
+    is not.
+
+    Non-regression test for
+    https://github.com/scikit-learn/scikit-learn/issues/18996
+    """
+    pairwise_metric = "rogerstanimoto"
+    # Draw from a seeded generator rather than the global numpy RNG so that a
+    # failure can be reproduced with exactly the same data.
+    rng = np.random.RandomState(0)
+    X = rng.randint(2, size=(5, 2), dtype=np.int32)
+    msg = f"Data will be converted to boolean for metric {pairwise_metric}"
+
+    with pytest.warns(DataConversionWarning, match=msg) as warn_record:
+        OPTICS(metric=pairwise_metric).fit(X)
+    # Count after the block is closed, once every warning has been recorded.
+    assert len(warn_record) == 1
+
+
+def test_nowarn_if_metric_no_bool():
+    """No conversion warning when the metric is not boolean, whatever the
+    data type is."""
+    pairwise_metric = "minkowski"
+    # Draw from a seeded generator rather than the global numpy RNG so that a
+    # failure can be reproduced with exactly the same data.
+    rng = np.random.RandomState(0)
+    X_bool = rng.randint(2, size=(5, 2), dtype=bool)
+    X_num = rng.randint(2, size=(5, 2), dtype=np.int32)
+
+    with warnings.catch_warnings():
+        # Escalate the warning to an error so the test fails if it is emitted.
+        warnings.simplefilter("error", DataConversionWarning)
+
+        # fit boolean data
+        OPTICS(metric=pairwise_metric).fit(X_bool)
+        # fit numeric data
+        OPTICS(metric=pairwise_metric).fit(X_num)
+
+
+def test_close_extract():
+    """Check extraction when the extraction eps is close to scaled max_eps."""
+    blob_centers = [[1, 1], [-1, -1], [1, -1]]
+    samples, _ = make_blobs(
+        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
+    )
+
+    model = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10)
+    model.fit(samples)
+
+    # Cluster labels start at 0, so a largest label of 2 means 3 clusters.
+    largest_label = max(model.labels_)
+    assert largest_label == 2
+
+
+@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
+@pytest.mark.parametrize("min_samples", [3, 10, 20])
+@pytest.mark.parametrize(
+    "csr_container, metric",
+    [
+        (None, "minkowski"),
+        (None, "euclidean"),
+        *((container, "euclidean") for container in CSR_CONTAINERS),
+    ],
+)
+def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container):
+    """OPTICS with DBSCAN extraction must label like DBSCAN, within 5%."""
+    blob_centers = [[1, 1], [-1, -1], [1, -1]]
+    X, _ = make_blobs(
+        n_samples=150, centers=blob_centers, cluster_std=0.4, random_state=0
+    )
+    if csr_container is not None:
+        X = csr_container(X)
+    X = X.astype(global_dtype, copy=False)
+    n_samples = X.shape[0]
+
+    # OPTICS with the DBSCAN extraction at the parametrized ``eps``.
+    optics = OPTICS(
+        min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
+    ).fit(X)
+
+    # Reference DBSCAN labels for the same ``eps`` and ``min_samples``.
+    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
+
+    contingency = contingency_matrix(dbscan.labels_, optics.labels_)
+    # Agreement is the smaller of the two sums of per-axis maxima.
+    agree = min(np.sum(np.max(contingency, axis=axis)) for axis in (0, 1))
+    disagree = n_samples - agree
+
+    percent_mismatch = np.round((disagree - 1) / n_samples, 2)
+
+    # The label mismatch must be at most 5% of the samples.
+    assert percent_mismatch <= 0.05
+
+
+def test_min_samples_edge_case(global_dtype):
+    """Check xi labels when ``min_samples`` equals or exceeds the group size."""
+    groups = (
+        [[0, 0], [0, 0.1], [0, -0.1]],
+        [[10, 10], [10, 9], [10, 11]],
+        [[100, 100], [100, 96], [100, 106]],
+    )
+    X = np.vstack(groups).astype(global_dtype, copy=False)
+
+    def xi_labels(min_samples, max_eps):
+        model = OPTICS(
+            min_samples=min_samples, max_eps=max_eps, cluster_method="xi", xi=0.04
+        )
+        return model.fit(X).labels_
+
+    # max_eps=7: each group of three points is its own cluster.
+    assert_array_equal(xi_labels(3, 7), np.repeat([0, 1, 2], 3))
+
+    # max_eps=3: the third group is labelled as noise.
+    assert_array_equal(xi_labels(3, 3), np.repeat([0, 1, -1], 3))
+
+    # min_samples=4: a warning is raised and every point is noise.
+    with pytest.warns(UserWarning, match="All reachability values"):
+        all_noise = xi_labels(4, 3)
+    assert_array_equal(all_noise, np.full(9, -1))
+
+
+# try arbitrary minimum sizes
+@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
+def test_min_cluster_size(min_cluster_size, global_dtype):
+    """No extracted cluster may be smaller than ``min_cluster_size``."""
+    # Keep every other sample only, for speed.
+    reduced = X[::2].astype(global_dtype, copy=False)
+
+    by_count = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(reduced)
+    clustered = by_count.labels_[by_count.labels_ != -1]
+    sizes = np.bincount(clustered)
+    if sizes.size:
+        assert min(sizes) >= min_cluster_size
+
+    # The behaviour must be the same when min_cluster_size is a fraction.
+    fraction = min_cluster_size / reduced.shape[0]
+    by_fraction = OPTICS(min_samples=9, min_cluster_size=fraction)
+    by_fraction.fit(reduced)
+    assert_array_equal(by_count.labels_, by_fraction.labels_)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_min_cluster_size_invalid2(csr_container):
+    """``min_cluster_size`` above the number of samples must be rejected."""
+    too_large = len(X) + 1
+    expected_error = "must be no greater than the "
+
+    # Dense input.
+    dense_model = OPTICS(min_cluster_size=too_large)
+    with pytest.raises(ValueError, match=expected_error):
+        dense_model.fit(X)
+
+    # Sparse input.
+    sparse_model = OPTICS(min_cluster_size=too_large, metric="euclidean")
+    with pytest.raises(ValueError, match=expected_error):
+        sparse_model.fit(csr_container(X))
+
+
+def test_processing_order():
+    """All unprocessed points, not only direct neighbors, must be considered
+    when picking the next point."""
+    points = [[0], [10], [-10], [25]]
+    model = OPTICS(min_samples=3, max_eps=15).fit(points)
+
+    expected = {
+        "reachability_": [np.inf, 10, 10, 15],
+        "core_distances_": [10, 15, np.inf, np.inf],
+        "ordering_": [0, 1, 2, 3],
+    }
+    for attribute, values in expected.items():
+        assert_array_equal(getattr(model, attribute), values)
+
+
+def test_compare_to_ELKI():
+    """Compare ordering, predecessors and reachability with ELKI references.
+
+    Expected values, computed with (future) ELKI 0.7.5 using:
+    java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
+      -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
+    where the FixedDBIDsFilter gives 0-indexed ids.
+    """
+
+    def check_against_reference(model, ordering, predecessor, reachability):
+        # Predecessor and reachability references are given in cluster order.
+        assert_array_equal(model.ordering_, np.array(ordering))
+        assert_array_equal(model.predecessor_[model.ordering_], np.array(predecessor))
+        assert_allclose(model.reachability_[model.ordering_], np.array(reachability))
+
+    # fmt: off
+    r1 = [
+        np.inf, 1.0574896366427478, 0.7587934993548423,
+        0.7290174038973836, 0.7290174038973836, 0.7290174038973836,
+        0.6861627576116127, 0.7587934993548423, 0.9280118450166668,
+        1.1748022534146194, 3.3355455741292257, 0.49618389254482587,
+        0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
+        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
+        0.2552805046961355, 0.3086779122185853, 4.163024452756142,
+        1.623152630340929, 0.45315840475822655, 0.25468325192031926,
+        0.2254004358159971, 0.18765711877083036, 0.1821471333893275,
+        0.1821471333893275, 0.18765711877083036, 0.18765711877083036,
+        0.2240202988740153, 1.154337614548715, 1.342604473837069,
+        1.323308536402633, 0.8607514948648837, 0.27219111215810565,
+        0.13260875220533205, 0.13260875220533205, 0.09890587675958984,
+        0.09890587675958984, 0.13548790801634494, 0.1575483940837384,
+        0.17515137170530226, 0.17575920159442388, 0.27219111215810565,
+        0.6101447895405373, 1.3189208094864302, 1.323308536402633,
+        2.2509184159764577, 2.4517810628594527, 3.675977064404973,
+        3.8264795626020365, 2.9130735341510614, 2.9130735341510614,
+        2.9130735341510614, 2.9130735341510614, 2.8459300127258036,
+        2.8459300127258036, 2.8459300127258036, 3.0321982337972537,
+    ]
+    o1 = [
+        0, 3, 6, 4, 7, 8, 2, 9, 5, 1,
+        31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
+        44, 21, 23, 24, 22, 25, 27, 29, 26, 28,
+        20, 40, 45, 46, 10, 15, 11, 13, 17, 19,
+        18, 12, 16, 14, 47, 49, 43, 48, 42, 41,
+        53, 57, 51, 52, 56, 59, 54, 55, 58, 50,
+    ]
+    p1 = [
+        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5,
+        1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
+        36, 44, 21, 23, 24, 22, 25, 25, 22, 22,
+        22, 21, 40, 45, 46, 10, 15, 15, 13, 13,
+        15, 11, 19, 15, 10, 47, 12, 45, 14, 43,
+        42, 53, 57, 57, 57, 57, 59, 59, 59, 58,
+    ]
+    # fmt: on
+
+    # Tests against known extraction array
+    # Does NOT work with metric='euclidean', because sklearn euclidean has
+    # worse numeric precision. 'minkowski' is slower but more accurate.
+    clust1 = OPTICS(min_samples=5).fit(X)
+    check_against_reference(clust1, o1, p1, r1)
+
+    # ELKI currently does not print the core distances (which are not used
+    # much in literature), but we can at least ensure this consistency: every
+    # point after the first one in the ordering has a reachability that is not
+    # below the core distance of its predecessor.
+    later = clust1.ordering_[1:]
+    predecessor_core = clust1.core_distances_[clust1.predecessor_[later]]
+    assert np.all(clust1.reachability_[later] >= predecessor_core)
+
+    # Expected values, computed with (future) ELKI 0.7.5, for max_eps=0.5.
+    # fmt: off
+    r2 = [
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, 0.27219111215810565,
+        0.13260875220533205, 0.13260875220533205, 0.09890587675958984,
+        0.09890587675958984, 0.13548790801634494, 0.1575483940837384,
+        0.17515137170530226, 0.17575920159442388, 0.27219111215810565,
+        0.4928068613197889, np.inf, 0.2666183922512113,
+        0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
+        0.1821471333893275, 0.18715928772277457, 0.18765711877083036,
+        0.18765711877083036, 0.25468325192031926, np.inf,
+        0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
+        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
+        0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+        np.inf, np.inf, np.inf,
+    ]
+    o2 = [
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+        10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
+        47, 46, 20, 22, 25, 23, 27, 29, 24, 26,
+        28, 21, 30, 32, 34, 33, 38, 39, 35, 37,
+        36, 31, 40, 41, 42, 43, 44, 45, 48, 49,
+        50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+    ]
+    p2 = [
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, 10, 15, 15, 13, 13, 15, 11, 19, 15,
+        10, 47, -1, 20, 22, 25, 25, 25, 25, 22,
+        22, 23, -1, 30, 30, 34, 34, 34, 32, 32,
+        37, 38, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    ]
+    # fmt: on
+    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
+    check_against_reference(clust2, o2, p2, r2)
+
+    # Core distances not above 0.5 must agree between the two fits.
+    small_core = np.flatnonzero(clust1.core_distances_ <= 0.5)
+    assert_allclose(
+        clust1.core_distances_[small_core], clust2.core_distances_[small_core]
+    )
+
+
+def test_extract_dbscan(global_dtype):
+    """Check DBSCAN extraction on an easy case.
+
+    All four blobs use the same spread, so clusters with different densities
+    are not covered here.
+    """
+    rng = np.random.RandomState(0)
+    n_points_per_cluster = 20
+    blob_centers = ([-5, -2], [4, -1], [1, 2], [-2, 3])
+    # The blobs are drawn from ``rng`` in the order of ``blob_centers``.
+    blobs = [
+        center + 0.2 * rng.randn(n_points_per_cluster, 2) for center in blob_centers
+    ]
+    data = np.vstack(blobs).astype(global_dtype, copy=False)
+
+    model = OPTICS(cluster_method="dbscan", eps=0.5).fit(data)
+    distinct_labels = np.sort(np.unique(model.labels_))
+    assert_array_equal(distinct_labels, [0, 1, 2, 3])
+
+
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+def test_precomputed_dists(global_dtype, csr_container):
+    """A precomputed distance matrix must give the same result as raw data."""
+    reduced = X[::2].astype(global_dtype, copy=False)
+    distances = pairwise_distances(reduced, metric="euclidean")
+    if csr_container is not None:
+        distances = csr_container(distances)
+
+    precomputed = OPTICS(min_samples=10, algorithm="brute", metric="precomputed")
+    with warnings.catch_warnings():
+        # EfficiencyWarning is silenced for the precomputed fit only.
+        warnings.simplefilter("ignore", EfficiencyWarning)
+        precomputed.fit(distances)
+
+    euclidean = OPTICS(min_samples=10, algorithm="brute", metric="euclidean")
+    euclidean.fit(reduced)
+
+    assert_allclose(precomputed.reachability_, euclidean.reachability_)
+    assert_array_equal(precomputed.labels_, euclidean.labels_)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_optics_input_not_modified_precomputed_sparse_nodiag(csr_container):
+    """The precomputed sparse matrix must not be modified in-place by ``fit``.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/27508
+    """
+    dense = np.random.RandomState(0).rand(6, 6)
+    # Zero the diagonal so that it is implicit in the sparse matrix. An
+    # in-place modification would turn those zeros into explicit entries.
+    np.fill_diagonal(dense, 0)
+    X = csr_container(dense)
+
+    # No stored entry lies on the diagonal before fitting.
+    rows, cols = X.nonzero()
+    assert not np.any(rows == cols)
+
+    snapshot = X.copy()
+    OPTICS(metric="precomputed").fit(X)
+
+    # Neither the number of stored entries nor the values may have changed,
+    # not even through the creation of explicit zeros.
+    assert X.nnz == snapshot.nnz
+    assert_array_equal(X.toarray(), snapshot.toarray())
+
+
+def test_optics_predecessor_correction_ordering():
+    """Reordering the samples must not change the labels they receive.
+
+    The cluster correction based on predecessors used the wrong indices, so
+    its outcome depended on the order of the data.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/26324
+    """
+
+    def fitted_labels(data):
+        return OPTICS(min_samples=3, metric="euclidean").fit(data).labels_
+
+    original = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1)
+    # Same samples, with the sample at index 3 moved to the end.
+    reorder = [0, 1, 2, 4, 5, 6, 7, 3]
+    permuted = original[reorder]
+
+    labels_original = fitted_labels(original)
+    labels_permuted = fitted_labels(permuted)
+
+    assert_array_equal(labels_original[reorder], labels_permuted)
@@ -0,0 +1,335 @@
+"""Testing for Spectral Clustering methods"""
+
+import pickle
+import re
+
+import numpy as np
+import pytest
+from scipy.linalg import LinAlgError
+
+from sklearn.cluster import SpectralClustering, spectral_clustering
+from sklearn.cluster._spectral import cluster_qr, discretize
+from sklearn.datasets import make_blobs
+from sklearn.feature_extraction import img_to_graph
+from sklearn.metrics import adjusted_rand_score
+from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import assert_array_equal
+from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS
+
+try:
+    from pyamg import smoothed_aggregation_solver  # noqa
+
+    amg_loaded = True
+except ImportError:
+    amg_loaded = False
+
+# Module-level fixture: three 2D blob centers, each shifted by +10.
+centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
+# 60 shuffled two-feature samples drawn around `centers` with a fixed seed so
+# the data is reproducible; the generated labels are discarded.
+X, _ = make_blobs(
+    n_samples=60,
+    n_features=2,
+    centers=centers,
+    cluster_std=0.4,
+    shuffle=True,
+    random_state=0,
+)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
+@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
+def test_spectral_clustering(eigen_solver, assign_labels, csr_container):
+    """Cluster a small precomputed affinity matrix, dense and sparse.
+
+    The affinity describes two groups (samples 0-2 and 3-6) that are only
+    weakly linked through sample 3. The fitted model must recover them and
+    survive a pickle round trip.
+    """
+    first_group = [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0]
+    bridge = [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0]
+    second_group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]
+    S = np.array([first_group] * 3 + [bridge] + [second_group] * 3)
+    expected = [1, 1, 1, 0, 0, 0, 0]
+
+    for mat in (S, csr_container(S)):
+        estimator = SpectralClustering(
+            random_state=0,
+            n_clusters=2,
+            affinity="precomputed",
+            eigen_solver=eigen_solver,
+            assign_labels=assign_labels,
+        )
+        model = estimator.fit(mat)
+
+        # Cluster ids are arbitrary: normalise so that sample 0 is labelled 1.
+        labels = 1 - model.labels_ if model.labels_[0] == 0 else model.labels_
+        assert adjusted_rand_score(labels, expected) == 1
+
+        # The fitted estimator must be picklable without losing its state.
+        restored = pickle.loads(pickle.dumps(model))
+        assert restored.n_clusters == model.n_clusters
+        assert restored.eigen_solver == model.eigen_solver
+        assert_array_equal(restored.labels_, model.labels_)
+
+
+@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
+@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
+def test_spectral_clustering_sparse(assign_labels, coo_container):
+    """Recover two tight blobs from a sparse precomputed RBF affinity."""
+    X, y = make_blobs(
+        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
+    )
+
+    # Shift the kernel down and clamp at zero so that tiny similarities
+    # become exact zeros before the matrix is made sparse.
+    affinity = rbf_kernel(X, gamma=1)
+    affinity = np.maximum(affinity - 1e-4, 0)
+    S = coo_container(affinity)
+
+    model = SpectralClustering(
+        random_state=0,
+        n_clusters=2,
+        affinity="precomputed",
+        assign_labels=assign_labels,
+    )
+    labels = model.fit(S).labels_
+    assert adjusted_rand_score(y, labels) == 1
+
+
+def test_precomputed_nearest_neighbors_filtering():
+    """Check that a precomputed graph with surplus neighbors gets filtered.
+
+    Fitting on a graph holding exactly `n_neighbors` neighbors and on one
+    holding ten more must produce the same labels.
+    """
+    X, y = make_blobs(
+        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
+    )
+    n_neighbors = 2
+
+    def labels_for(additional_neighbors):
+        # Build a connectivity graph with the requested number of neighbors
+        # and cluster on it.
+        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(X)
+        graph = nn.kneighbors_graph(X, mode="connectivity")
+        model = SpectralClustering(
+            random_state=0,
+            n_clusters=2,
+            affinity="precomputed_nearest_neighbors",
+            n_neighbors=n_neighbors,
+        )
+        return model.fit(graph).labels_
+
+    results = [labels_for(extra) for extra in [0, 10]]
+
+    assert_array_equal(results[0], results[1])
+
+
+def test_affinities():
+    """Exercise the affinity options accepted by SpectralClustering."""
+    # Note: the random_state below has been selected to have a dataset that
+    # yields a stable eigen decomposition both when built on OSX and Linux.
+    blobs, y = make_blobs(
+        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
+    )
+
+    # Nearest neighbors affinity: a connectivity warning is expected.
+    nn_model = SpectralClustering(
+        n_clusters=2, affinity="nearest_neighbors", random_state=0
+    )
+    with pytest.warns(UserWarning, match="not fully connected"):
+        nn_model.fit(blobs)
+    assert adjusted_rand_score(y, nn_model.labels_) == 1
+
+    # Default affinity with an explicit gamma.
+    gamma_model = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
+    assert adjusted_rand_score(y, gamma_model.fit(blobs).labels_) == 1
+
+    X = check_random_state(10).rand(10, 5) * 10
+
+    for kern in kernel_metrics():
+        # Additive chi^2 gives a negative similarity matrix which
+        # doesn't make sense for spectral clustering
+        if kern == "additive_chi2":
+            continue
+        kernel_model = SpectralClustering(n_clusters=2, affinity=kern, random_state=0)
+        assert (X.shape[0],) == kernel_model.fit(X).labels_.shape
+
+    # A callable returning a constant similarity.
+    constant_model = SpectralClustering(
+        n_clusters=2, affinity=lambda x, y: 1, random_state=0
+    )
+    assert (X.shape[0],) == constant_model.fit(X).labels_.shape
+
+    def histogram(x, y, **kwargs):
+        # Histogram kernel implemented as a callable.
+        assert kwargs == {}  # no kernel_params that we didn't ask for
+        return np.minimum(x, y).sum()
+
+    histogram_model = SpectralClustering(
+        n_clusters=2, affinity=histogram, random_state=0
+    )
+    assert (X.shape[0],) == histogram_model.fit(X).labels_.shape
+
+
+def test_cluster_qr():
+    """Check the shape, coverage and dtype stability of cluster_qr labels."""
+    # cluster_qr is only meant for the rows of the eigenvectors computed
+    # inside spectral clustering, not for generic data. It must nevertheless
+    # return the same labels for a fixed input regardless of its dtype, even
+    # if those labels carry no meaning.
+    n_samples, n_components = 10, 5
+    rng = np.random.RandomState(seed=8)
+    data = rng.randn(n_samples, n_components)
+
+    labels_float64 = cluster_qr(data.astype(np.float64))
+    # One cluster identifier per sample.
+    assert labels_float64.shape == (n_samples,)
+    # Every component must be used by the assignment.
+    used_labels = np.unique(labels_float64)
+    assert np.array_equal(used_labels, np.arange(n_components))
+
+    # Single precision input must lead to the same assignment.
+    labels_float32 = cluster_qr(data.astype(np.float32))
+    assert np.array_equal(labels_float64, labels_float32)
+
+
+def test_cluster_qr_permutation_invariance():
+    """Check that permuting the samples permutes the cluster_qr labels."""
+    n_samples, n_components = 100, 5
+    rng = np.random.RandomState(seed=8)
+    data = rng.randn(n_samples, n_components)
+    perm = rng.permutation(n_samples)
+
+    # Labelling then permuting must equal permuting then labelling.
+    labels_then_permuted = cluster_qr(data)[perm]
+    permuted_then_labels = cluster_qr(data[perm])
+    assert np.array_equal(labels_then_permuted, permuted_then_labels)
+
+
+@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
+@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
+def test_discretize(n_samples, coo_container):
+    """Check that discretize recovers labels from a noisy indicator matrix."""
+    rng = np.random.RandomState(seed=8)
+    sample_index = np.arange(n_samples)
+    for n_class in range(2, 10):
+        n_columns = n_class + 1
+        # Random class labels, stored as floats.
+        y_true = rng.randint(0, n_columns, n_samples).astype(float)
+        # One-hot class assignment matrix built through a sparse container.
+        y_indicator = coo_container(
+            (np.ones(n_samples), (sample_index, y_true)),
+            shape=(n_samples, n_columns),
+        )
+        # Perturb the assignment matrix with Gaussian noise.
+        noise = 0.1 * rng.randn(n_samples, n_columns)
+        y_true_noisy = y_indicator.toarray() + noise
+        y_pred = discretize(y_true_noisy, random_state=rng)
+        assert adjusted_rand_score(y_true, y_pred) > 0.8
+
+
+# TODO: Remove when pyamg replaces the sp.rand call with np.random.rand
+# https://github.com/scikit-learn/scikit-learn/issues/15913
+@pytest.mark.filterwarnings(
+    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
+)
+# TODO: Remove when pyamg removes the use of np.float
+@pytest.mark.filterwarnings(
+    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
+)
+# TODO: Remove when pyamg removes the use of pinv2
+@pytest.mark.filterwarnings(
+    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
+)
+# TODO: Remove when pyamg removes the use of np.find_common_type
+@pytest.mark.filterwarnings(
+    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
+)
+def test_spectral_clustering_with_arpack_amg_solvers():
+    """Check that the arpack and amg eigen solvers give the same clustering.
+
+    Based on the toy example from plot_segmentation_toy.py. When pyamg is
+    not installed, requesting the amg solver must raise a ValueError.
+    """
+    # A small image made of two overlapping discs ("coins").
+    x, y = np.indices((40, 40))
+
+    def disc(center, radius):
+        return (x - center[0]) ** 2 + (y - center[1]) ** 2 < radius**2
+
+    circles = disc((14, 12), 8) | disc((20, 25), 7)
+    mask = circles.copy()
+    img = circles.astype(float)
+
+    # Turn the gradient graph of the masked image into an affinity graph.
+    graph = img_to_graph(img, mask=mask)
+    graph.data = np.exp(-graph.data / graph.data.std())
+
+    labels_arpack = spectral_clustering(
+        graph, n_clusters=2, eigen_solver="arpack", random_state=0
+    )
+    assert len(np.unique(labels_arpack)) == 2
+
+    if not amg_loaded:
+        with pytest.raises(ValueError):
+            spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0)
+        return
+
+    labels_amg = spectral_clustering(
+        graph, n_clusters=2, eigen_solver="amg", random_state=0
+    )
+    assert adjusted_rand_score(labels_arpack, labels_amg) == 1
+
+
+def test_n_components():
+    """Check the default of `n_components` and that it influences the result."""
+    X, y = make_blobs(
+        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
+    )
+
+    def fitted_labels(**params):
+        return SpectralClustering(random_state=0, **params).fit(X).labels_
+
+    labels = fitted_labels(n_clusters=2)
+
+    # n_components defaults to n_clusters: setting it explicitly to the same
+    # value must not change the labels.
+    labels_same_ncomp = fitted_labels(n_clusters=2, n_components=2)
+    assert_array_equal(labels, labels_same_ncomp)
+
+    # n_components affects the result: here n_clusters keeps its default of 8
+    # while n_components is set to 2.
+    labels_diff_ncomp = fitted_labels(n_components=2)
+    assert not np.array_equal(labels, labels_diff_ncomp)
+
+
+@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
+def test_verbose(assign_labels, capsys):
+    """Check the verbose output of SpectralClustering for each label strategy.
+
+    Parameters
+    ----------
+    assign_labels : str
+        Label assignment strategy forwarded to the estimator.
+    capsys : pytest fixture
+        Captures what the estimator prints to stdout.
+    """
+    X, y = make_blobs(
+        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
+    )
+
+    # The parametrized strategy must be forwarded to the estimator; without
+    # it every parametrization silently runs the default strategy and the
+    # "discretize" / "cluster_qr" cases are never exercised.
+    SpectralClustering(
+        n_clusters=2, random_state=42, verbose=1, assign_labels=assign_labels
+    ).fit(X)
+
+    captured = capsys.readouterr()
+
+    assert re.search(r"Computing label assignment using", captured.out)
+
+    # Only the KMeans strategy prints its own progress messages.
+    if assign_labels == "kmeans":
+        assert re.search(r"Initialization complete", captured.out)
+        assert re.search(r"Iteration [0-9]+, inertia", captured.out)
+
+
+def test_spectral_clustering_np_matrix_raises():
+    """Check that passing a np.matrix to spectral_clustering raises an
+    informative TypeError. See #10993"""
+    affinity = np.matrix([[0.0, 2.0], [2.0, 0.0]])
+    expected_message = r"np\.matrix is not supported. Please convert to a numpy array"
+
+    with pytest.raises(TypeError, match=expected_message):
+        spectral_clustering(affinity)
+
+
+def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
+    """Check that discretize gives up with a LinAlgError instead of looping
+    forever when the SVD never converges.
+
+    Non-regression test for #21380
+    """
+
+    def always_failing_svd(*args, **kwargs):
+        # Stand-in for np.linalg.svd that fails on every call.
+        raise LinAlgError()
+
+    monkeypatch.setattr(np.linalg, "svd", always_failing_svd)
+    embedding = np.ones((10, 4))
+
+    with pytest.raises(LinAlgError, match="SVD did not converge"):
+        discretize(embedding)
@@ -0,0 +1,20 @@
+"""Meta-estimators for building composite models with transformers
+
+In addition to its current contents, this module will eventually be home to
+refurbished versions of Pipeline and FeatureUnion.
+
+"""
+
+from ._column_transformer import (
+    ColumnTransformer,
+    make_column_selector,
+    make_column_transformer,
+)
+from ._target import TransformedTargetRegressor
+
+__all__ = [
+    "ColumnTransformer",
+    "make_column_transformer",
+    "TransformedTargetRegressor",
+    "make_column_selector",
+]
@@ -0,0 +1,352 @@
+# Authors: Andreas Mueller <andreas.mueller@columbia.edu>
+#          Guillaume Lemaitre <guillaume.lemaitre@inria.fr>
+# License: BSD 3 clause
+
+import warnings
+
+import numpy as np
+
+from ..base import BaseEstimator, RegressorMixin, _fit_context, clone
+from ..exceptions import NotFittedError
+from ..preprocessing import FunctionTransformer
+from ..utils import _safe_indexing, check_array
+from ..utils._param_validation import HasMethods
+from ..utils._tags import _safe_tags
+from ..utils.metadata_routing import (
+    _raise_for_unsupported_routing,
+    _RoutingNotSupportedMixin,
+)
+from ..utils.validation import check_is_fitted
+
+__all__ = ["TransformedTargetRegressor"]
+
+
+class TransformedTargetRegressor(
+    _RoutingNotSupportedMixin, RegressorMixin, BaseEstimator
+):
+    """Meta-estimator to regress on a transformed target.
+
+    Useful for applying a non-linear transformation to the target `y` in
+    regression problems. This transformation can be given as a Transformer
+    such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a
+    function and its inverse such as `np.log` and `np.exp`.
+
+    The computation during :meth:`fit` is::
+
+        regressor.fit(X, func(y))
+
+    or::
+
+        regressor.fit(X, transformer.transform(y))
+
+    The computation during :meth:`predict` is::
+
+        inverse_func(regressor.predict(X))
+
+    or::
+
+        transformer.inverse_transform(regressor.predict(X))
+
+    Read more in the :ref:`User Guide <transformed_target_regressor>`.
+
+    .. versionadded:: 0.20
+
+    Parameters
+    ----------
+    regressor : object, default=None
+        Regressor object such as derived from
+        :class:`~sklearn.base.RegressorMixin`. This regressor will
+        automatically be cloned each time prior to fitting. If `regressor is
+        None`, :class:`~sklearn.linear_model.LinearRegression` is created and used.
+
+    transformer : object, default=None
+        Estimator object such as derived from
+        :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time
+        as `func` and `inverse_func`. If `transformer is None` as well as
+        `func` and `inverse_func`, the transformer will be an identity
+        transformer. Note that the transformer will be cloned during fitting.
+        Also, the transformer is restricting `y` to be a numpy array.
+
+    func : function, default=None
+        Function to apply to `y` before passing to :meth:`fit`. Cannot be set
+        at the same time as `transformer`. If `func is None`, the function used will be
+        the identity function. If `func` is set, `inverse_func` also needs to be
+        provided. The function needs to return a 2-dimensional array.
+
+    inverse_func : function, default=None
+        Function to apply to the prediction of the regressor. Cannot be set at
+        the same time as `transformer`. The inverse function is used to return
+        predictions to the same space of the original training labels. If
+        `inverse_func` is set, `func` also needs to be provided. The inverse
+        function needs to return a 2-dimensional array.
+
+    check_inverse : bool, default=True
+        Whether to check that `transform` followed by `inverse_transform`
+        or `func` followed by `inverse_func` leads to the original targets.
+
+    Attributes
+    ----------
+    regressor_ : object
+        Fitted regressor.
+
+    transformer_ : object
+        Transformer used in :meth:`fit` and :meth:`predict`.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`. Only defined if the
+        underlying regressor exposes such an attribute when fit.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    sklearn.preprocessing.FunctionTransformer : Construct a transformer from an
+        arbitrary callable.
+
+    Notes
+    -----
+    Internally, the target `y` is always converted into a 2-dimensional array
+    to be used by scikit-learn transformers. At the time of prediction, the
+    output will be reshaped to have the same number of dimensions as `y`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from sklearn.compose import TransformedTargetRegressor
+    >>> tt = TransformedTargetRegressor(regressor=LinearRegression(),
+    ...                                 func=np.log, inverse_func=np.exp)
+    >>> X = np.arange(4).reshape(-1, 1)
+    >>> y = np.exp(2 * X).ravel()
+    >>> tt.fit(X, y)
+    TransformedTargetRegressor(...)
+    >>> tt.score(X, y)
+    1.0
+    >>> tt.regressor_.coef_
+    array([2.])
+
+    For a more detailed example use case refer to
+    :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py`.
+    """
+
+    _parameter_constraints: dict = {
+        "regressor": [HasMethods(["fit", "predict"]), None],
+        "transformer": [HasMethods("transform"), None],
+        "func": [callable, None],
+        "inverse_func": [callable, None],
+        "check_inverse": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        regressor=None,
+        *,
+        transformer=None,
+        func=None,
+        inverse_func=None,
+        check_inverse=True,
+    ):
+        self.regressor = regressor
+        self.transformer = transformer
+        self.func = func
+        self.inverse_func = inverse_func
+        self.check_inverse = check_inverse
+
+    def _fit_transformer(self, y):
+        """Check transformer and fit transformer.
+
+        Create the default transformer, fit it and make additional inverse
+        check on a subset (optional).
+
+        Parameters
+        ----------
+        y : ndarray
+            Target values, already reshaped to 2 dimensions by :meth:`fit`.
+
+        Raises
+        ------
+        ValueError
+            If `transformer` is set together with `func` or `inverse_func`,
+            or if only one of `func` and `inverse_func` is provided.
+        """
+        if self.transformer is not None and (
+            self.func is not None or self.inverse_func is not None
+        ):
+            raise ValueError(
+                "'transformer' and functions 'func'/'inverse_func' cannot both be set."
+            )
+        elif self.transformer is not None:
+            self.transformer_ = clone(self.transformer)
+        else:
+            if (self.func is not None and self.inverse_func is None) or (
+                self.func is None and self.inverse_func is not None
+            ):
+                lacking_param, existing_param = (
+                    ("func", "inverse_func")
+                    if self.func is None
+                    else ("inverse_func", "func")
+                )
+                raise ValueError(
+                    f"When '{existing_param}' is provided, '{lacking_param}' must also"
+                    f" be provided. If {lacking_param} is supposed to be the default,"
+                    " you need to explicitly pass it the identity function."
+                )
+            self.transformer_ = FunctionTransformer(
+                func=self.func,
+                inverse_func=self.inverse_func,
+                validate=True,
+                check_inverse=self.check_inverse,
+            )
+        # XXX: sample_weight is not currently passed to the
+        # transformer. However, if transformer starts using sample_weight, the
+        # code should be modified accordingly. At the time to consider the
+        # sample_prop feature, it is also a good use case to be considered.
+        self.transformer_.fit(y)
+        if self.check_inverse:
+            # Only check the round trip on a strided subset of the targets
+            # (every `n_samples // 10`-th row, at least every row).
+            idx_selected = slice(None, None, max(1, y.shape[0] // 10))
+            y_sel = _safe_indexing(y, idx_selected)
+            y_sel_t = self.transformer_.transform(y_sel)
+            if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)):
+                warnings.warn(
+                    (
+                        "The provided functions or transformer are"
+                        " not strictly inverse of each other. If"
+                        " you are sure you want to proceed regardless"
+                        ", set 'check_inverse=False'"
+                    ),
+                    UserWarning,
+                )
+
+    @_fit_context(
+        # TransformedTargetRegressor.regressor/transformer are not validated yet.
+        prefer_skip_nested_validation=False
+    )
+    def fit(self, X, y, **fit_params):
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vector, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            Target values.
+
+        **fit_params : dict
+            Parameters passed to the `fit` method of the underlying
+            regressor.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+
+        Raises
+        ------
+        ValueError
+            If `y` is None.
+        """
+        _raise_for_unsupported_routing(self, "fit", **fit_params)
+        if y is None:
+            raise ValueError(
+                f"This {self.__class__.__name__} estimator "
+                "requires y to be passed, but the target y is None."
+            )
+        y = check_array(
+            y,
+            input_name="y",
+            accept_sparse=False,
+            force_all_finite=True,
+            ensure_2d=False,
+            dtype="numeric",
+            allow_nd=True,
+        )
+
+        # store the number of dimension of the target to predict an array of
+        # similar shape at predict
+        self._training_dim = y.ndim
+
+        # transformers are designed to modify X which is 2d dimensional, we
+        # need to modify y accordingly.
+        if y.ndim == 1:
+            y_2d = y.reshape(-1, 1)
+        else:
+            y_2d = y
+        self._fit_transformer(y_2d)
+
+        # transform y and convert back to 1d array if needed
+        y_trans = self.transformer_.transform(y_2d)
+        # FIXME: a FunctionTransformer can return a 1D array even when validate
+        # is set to True. Therefore, we need to check the number of dimension
+        # first.
+        if y_trans.ndim == 2 and y_trans.shape[1] == 1:
+            y_trans = y_trans.squeeze(axis=1)
+
+        if self.regressor is None:
+            from ..linear_model import LinearRegression
+
+            self.regressor_ = LinearRegression()
+        else:
+            self.regressor_ = clone(self.regressor)
+
+        self.regressor_.fit(X, y_trans, **fit_params)
+
+        if hasattr(self.regressor_, "feature_names_in_"):
+            self.feature_names_in_ = self.regressor_.feature_names_in_
+        elif hasattr(self, "feature_names_in_"):
+            # Refit on input without feature names: drop the names recorded
+            # by a previous fit so they do not describe stale input.
+            del self.feature_names_in_
+
+        return self
+
+    def predict(self, X, **predict_params):
+        """Predict using the base regressor, applying inverse.
+
+        The regressor is used to predict and the `inverse_func` or
+        `inverse_transform` is applied before returning the prediction.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Samples.
+
+        **predict_params : dict of str -> object
+            Parameters passed to the `predict` method of the underlying
+            regressor.
+
+        Returns
+        -------
+        y_hat : ndarray of shape (n_samples,) or (n_samples, n_outputs)
+            Predicted values.
+        """
+        check_is_fitted(self)
+        pred = self.regressor_.predict(X, **predict_params)
+        # The transformer works on 2D arrays: promote 1D predictions to a
+        # single column before inverting the transformation.
+        if pred.ndim == 1:
+            pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1))
+        else:
+            pred_trans = self.transformer_.inverse_transform(pred)
+        # Give back a 1D array when the training target was 1D.
+        if (
+            self._training_dim == 1
+            and pred_trans.ndim == 2
+            and pred_trans.shape[1] == 1
+        ):
+            pred_trans = pred_trans.squeeze(axis=1)
+
+        return pred_trans
+
+    def _more_tags(self):
+        # The "multioutput" tag is taken from the wrapped regressor, falling
+        # back to LinearRegression when no regressor was given.
+        regressor = self.regressor
+        if regressor is None:
+            from ..linear_model import LinearRegression
+
+            regressor = LinearRegression()
+
+        return {
+            "poor_score": True,
+            "multioutput": _safe_tags(regressor, key="multioutput"),
+        }
+
+    @property
+    def n_features_in_(self):
+        """Number of features seen during :term:`fit`."""
+        # For consistency with other estimators we raise an AttributeError so
+        # that hasattr() returns False if the estimator isn't fitted.
+        try:
+            check_is_fitted(self)
+        except NotFittedError as nfe:
+            raise AttributeError(
+                "{} object has no n_features_in_ attribute.".format(
+                    self.__class__.__name__
+                )
+            ) from nfe
+
+        return self.regressor_.n_features_in_
@@ -0,0 +1,395 @@
+import numpy as np
+import pytest
+
+from sklearn import datasets
+from sklearn.base import BaseEstimator, TransformerMixin, clone
+from sklearn.compose import TransformedTargetRegressor
+from sklearn.dummy import DummyRegressor
+from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import FunctionTransformer, StandardScaler
+from sklearn.utils._testing import assert_allclose, assert_no_warnings
+
+friedman = datasets.make_friedman1(random_state=0)
+
+
+def test_transform_target_regressor_error():
+    X, y = friedman
+    # provide a transformer and functions at the same time
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(),
+        transformer=StandardScaler(),
+        func=np.exp,
+        inverse_func=np.log,
+    )
+    with pytest.raises(
+        ValueError,
+        match="'transformer' and functions 'func'/'inverse_func' cannot both be set.",
+    ):
+        regr.fit(X, y)
+    # fit with sample_weight with a regressor which does not support it
+    sample_weight = np.ones((y.shape[0],))
+    regr = TransformedTargetRegressor(
+        regressor=OrthogonalMatchingPursuit(), transformer=StandardScaler()
+    )
+    with pytest.raises(
+        TypeError,
+        match=r"fit\(\) got an unexpected " "keyword argument 'sample_weight'",
+    ):
+        regr.fit(X, y, sample_weight=sample_weight)
+
+    # one of (func, inverse_func) is given but the other one is not
+    regr = TransformedTargetRegressor(func=np.exp)
+    with pytest.raises(
+        ValueError,
+        match="When 'func' is provided, 'inverse_func' must also be provided",
+    ):
+        regr.fit(X, y)
+
+    regr = TransformedTargetRegressor(inverse_func=np.log)
+    with pytest.raises(
+        ValueError,
+        match="When 'inverse_func' is provided, 'func' must also be provided",
+    ):
+        regr.fit(X, y)
+
+
+def test_transform_target_regressor_invertible():
+    X, y = friedman
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(),
+        func=np.sqrt,
+        inverse_func=np.log,
+        check_inverse=True,
+    )
+    with pytest.warns(
+        UserWarning,
+        match=(
+            "The provided functions or"
+            " transformer are not strictly inverse of each other."
+        ),
+    ):
+        regr.fit(X, y)
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log
+    )
+    regr.set_params(check_inverse=False)
+    assert_no_warnings(regr.fit, X, y)
+
+
+def _check_standard_scaled(y, y_pred):
+    y_mean = np.mean(y, axis=0)
+    y_std = np.std(y, axis=0)
+    assert_allclose((y - y_mean) / y_std, y_pred)
+
+
+def _check_shifted_by_one(y, y_pred):
+    assert_allclose(y + 1, y_pred)
+
+
+def test_transform_target_regressor_functions():
+    X, y = friedman
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), func=np.log, inverse_func=np.exp
+    )
+    y_pred = regr.fit(X, y).predict(X)
+    # check the transformer output
+    y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
+    assert_allclose(np.log(y), y_tran)
+    assert_allclose(
+        y, regr.transformer_.inverse_transform(y_tran.reshape(-1, 1)).squeeze()
+    )
+    assert y.shape == y_pred.shape
+    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))
+    # check the regressor output
+    lr = LinearRegression().fit(X, regr.func(y))
+    assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel())
+
+
+def test_transform_target_regressor_functions_multioutput():
+    X = friedman[0]
+    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), func=np.log, inverse_func=np.exp
+    )
+    y_pred = regr.fit(X, y).predict(X)
+    # check the transformer output
+    y_tran = regr.transformer_.transform(y)
+    assert_allclose(np.log(y), y_tran)
+    assert_allclose(y, regr.transformer_.inverse_transform(y_tran))
+    assert y.shape == y_pred.shape
+    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))
+    # check the regressor output
+    lr = LinearRegression().fit(X, regr.func(y))
+    assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel())
+
+
+@pytest.mark.parametrize(
+    "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)]
+)
+def test_transform_target_regressor_1d_transformer(X, y):
+    # All transformer in scikit-learn expect 2D data. FunctionTransformer with
+    # validate=False lift this constraint without checking that the input is a
+    # 2D vector. We check the consistency of the data shape using a 1D and 2D y
+    # array.
+    transformer = FunctionTransformer(
+        func=lambda x: x + 1, inverse_func=lambda x: x - 1
+    )
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), transformer=transformer
+    )
+    y_pred = regr.fit(X, y).predict(X)
+    assert y.shape == y_pred.shape
+    # consistency forward transform
+    y_tran = regr.transformer_.transform(y)
+    _check_shifted_by_one(y, y_tran)
+    assert y.shape == y_pred.shape
+    # consistency inverse transform
+    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
+    # consistency of the regressor
+    lr = LinearRegression()
+    transformer2 = clone(transformer)
+    lr.fit(X, transformer2.fit_transform(y))
+    y_lr_pred = lr.predict(X)
+    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
+    assert_allclose(regr.regressor_.coef_, lr.coef_)
+
+
+@pytest.mark.parametrize(
+    "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)]
+)
+def test_transform_target_regressor_2d_transformer(X, y):
+    # Check consistency with transformer accepting only 2D array and a 1D/2D y
+    # array.
+    transformer = StandardScaler()
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), transformer=transformer
+    )
+    y_pred = regr.fit(X, y).predict(X)
+    assert y.shape == y_pred.shape
+    # consistency forward transform
+    if y.ndim == 1:  # create a 2D array and squeeze results
+        y_tran = regr.transformer_.transform(y.reshape(-1, 1))
+    else:
+        y_tran = regr.transformer_.transform(y)
+    _check_standard_scaled(y, y_tran.squeeze())
+    assert y.shape == y_pred.shape
+    # consistency inverse transform
+    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
+    # consistency of the regressor
+    lr = LinearRegression()
+    transformer2 = clone(transformer)
+    if y.ndim == 1:  # create a 2D array and squeeze results
+        lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze())
+        y_lr_pred = lr.predict(X).reshape(-1, 1)
+        y_pred2 = transformer2.inverse_transform(y_lr_pred).squeeze()
+    else:
+        lr.fit(X, transformer2.fit_transform(y))
+        y_lr_pred = lr.predict(X)
+        y_pred2 = transformer2.inverse_transform(y_lr_pred)
+
+    assert_allclose(y_pred, y_pred2)
+    assert_allclose(regr.regressor_.coef_, lr.coef_)
+
+
+def test_transform_target_regressor_2d_transformer_multioutput():
+    # Check consistency with transformer accepting only 2D array and a 2D y
+    # array.
+    X = friedman[0]
+    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
+    transformer = StandardScaler()
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), transformer=transformer
+    )
+    y_pred = regr.fit(X, y).predict(X)
+    assert y.shape == y_pred.shape
+    # consistency forward transform
+    y_tran = regr.transformer_.transform(y)
+    _check_standard_scaled(y, y_tran)
+    assert y.shape == y_pred.shape
+    # consistency inverse transform
+    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
+    # consistency of the regressor
+    lr = LinearRegression()
+    transformer2 = clone(transformer)
+    lr.fit(X, transformer2.fit_transform(y))
+    y_lr_pred = lr.predict(X)
+    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
+    assert_allclose(regr.regressor_.coef_, lr.coef_)
+
+
+def test_transform_target_regressor_3d_target():
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/18866
+    # Check with a 3D target with a transformer that reshapes the target
+    X = friedman[0]
+    y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2])
+
+    def flatten_data(data):
+        return data.reshape(data.shape[0], -1)
+
+    def unflatten_data(data):
+        return data.reshape(data.shape[0], -1, 2)
+
+    transformer = FunctionTransformer(func=flatten_data, inverse_func=unflatten_data)
+    regr = TransformedTargetRegressor(
+        regressor=LinearRegression(), transformer=transformer
+    )
+    y_pred = regr.fit(X, y).predict(X)
+    assert y.shape == y_pred.shape
+
+
+def test_transform_target_regressor_multi_to_single():
+    X = friedman[0]
+    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])
+
+    def func(y):
+        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
+        return out[:, np.newaxis]
+
+    def inverse_func(y):
+        return y
+
+    tt = TransformedTargetRegressor(
+        func=func, inverse_func=inverse_func, check_inverse=False
+    )
+    tt.fit(X, y)
+    y_pred_2d_func = tt.predict(X)
+    assert y_pred_2d_func.shape == (100, 1)
+
+    # force that the function only return a 1D array
+    def func(y):
+        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
+
+    tt = TransformedTargetRegressor(
+        func=func, inverse_func=inverse_func, check_inverse=False
+    )
+    tt.fit(X, y)
+    y_pred_1d_func = tt.predict(X)
+    assert y_pred_1d_func.shape == (100, 1)
+
+    assert_allclose(y_pred_1d_func, y_pred_2d_func)
+
+
+class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator):
+    """Identity transformer asserting that every input is a numpy array."""
+
+    def fit(self, X, y=None):
+        # nothing is learned; only the input type is checked
+        assert isinstance(X, np.ndarray)
+        return self
+
+    def transform(self, X):
+        # returns the input unchanged after the type check
+        assert isinstance(X, np.ndarray)
+        return X
+
+    def inverse_transform(self, X):
+        # returns the input unchanged after the type check
+        assert isinstance(X, np.ndarray)
+        return X
+
+
+class DummyCheckerListRegressor(DummyRegressor):
+    def fit(self, X, y, sample_weight=None):
+        assert isinstance(X, list)
+        return super().fit(X, y, sample_weight)
+
+    def predict(self, X):
+        assert isinstance(X, list)
+        return super().predict(X)
+
+
+def test_transform_target_regressor_ensure_y_array():
+    # check that the target ``y`` passed to the transformer will always be a
+    # numpy array. Similarly, if ``X`` is passed as a list, we check that the
+    # predictor receive as it is.
+    X, y = friedman
+    tt = TransformedTargetRegressor(
+        transformer=DummyCheckerArrayTransformer(),
+        regressor=DummyCheckerListRegressor(),
+        check_inverse=False,
+    )
+    tt.fit(X.tolist(), y.tolist())
+    tt.predict(X.tolist())
+    with pytest.raises(AssertionError):
+        tt.fit(X, y.tolist())
+    with pytest.raises(AssertionError):
+        tt.predict(X)
+
+
+class DummyTransformer(TransformerMixin, BaseEstimator):
+    """Dummy identity transformer which counts how many times fit was called."""
+
+    def __init__(self, fit_counter=0):
+        self.fit_counter = fit_counter
+
+    def fit(self, X, y=None):
+        # the counter is the only state; tests read it to count fit calls
+        self.fit_counter += 1
+        return self
+
+    def transform(self, X):
+        # identity
+        return X
+
+    def inverse_transform(self, X):
+        # identity
+        return X
+
+
+@pytest.mark.parametrize("check_inverse", [False, True])
+def test_transform_target_regressor_count_fit(check_inverse):
+    # regression test for gh-issue #11618
+    # check that we only call a single time fit for the transformer
+    X, y = friedman
+    ttr = TransformedTargetRegressor(
+        transformer=DummyTransformer(), check_inverse=check_inverse
+    )
+    ttr.fit(X, y)
+    assert ttr.transformer_.fit_counter == 1
+
+
+class DummyRegressorWithExtraFitParams(DummyRegressor):
+    def fit(self, X, y, sample_weight=None, check_input=True):
+        # on the test below we force this to false, we make sure this is
+        # actually passed to the regressor
+        assert not check_input
+        return super().fit(X, y, sample_weight)
+
+
+def test_transform_target_regressor_pass_fit_parameters():
+    X, y = friedman
+    regr = TransformedTargetRegressor(
+        regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer()
+    )
+
+    regr.fit(X, y, check_input=False)
+    assert regr.transformer_.fit_counter == 1
+
+
+def test_transform_target_regressor_route_pipeline():
+    X, y = friedman
+
+    regr = TransformedTargetRegressor(
+        regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer()
+    )
+    estimators = [("normalize", StandardScaler()), ("est", regr)]
+
+    pip = Pipeline(estimators)
+    pip.fit(X, y, **{"est__check_input": False})
+
+    assert regr.transformer_.fit_counter == 1
+
+
+class DummyRegressorWithExtraPredictParams(DummyRegressor):
+    """DummyRegressor whose `predict` records the call and checks `check_input`."""
+
+    def predict(self, X, check_input=True):
+        # In the test below we make sure that the check_input parameter is
+        # passed as False. The flag is set before the assertion, so it
+        # records the call even when the assertion fails.
+        self.predict_called = True
+        assert not check_input
+        return super().predict(X)
+
+
+def test_transform_target_regressor_pass_extra_predict_parameters():
+    # Checks that predict kwargs are passed to regressor.
+    X, y = friedman
+    regr = TransformedTargetRegressor(
+        regressor=DummyRegressorWithExtraPredictParams(), transformer=DummyTransformer()
+    )
+
+    regr.fit(X, y)
+    regr.predict(X, check_input=False)
+    assert regr.regressor_.predict_called
@@ -0,0 +1,315 @@
+import builtins
+import platform
+import sys
+from contextlib import suppress
+from functools import wraps
+from os import environ
+from unittest import SkipTest
+
+import joblib
+import numpy as np
+import pytest
+from _pytest.doctest import DoctestItem
+from threadpoolctl import threadpool_limits
+
+from sklearn import config_context, set_config
+from sklearn._min_dependencies import PYTEST_MIN_VERSION
+from sklearn.datasets import (
+    fetch_20newsgroups,
+    fetch_20newsgroups_vectorized,
+    fetch_california_housing,
+    fetch_covtype,
+    fetch_kddcup99,
+    fetch_lfw_pairs,
+    fetch_lfw_people,
+    fetch_olivetti_faces,
+    fetch_rcv1,
+    fetch_species_distributions,
+)
+from sklearn.tests import random_seed
+from sklearn.utils._testing import get_pytest_filterwarning_lines
+from sklearn.utils.fixes import (
+    _IS_32BIT,
+    np_base_version,
+    parse_version,
+    sp_version,
+)
+
+# Fail at conftest import time when the installed pytest is older than the
+# minimum version declared in sklearn._min_dependencies.
+if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION):
+    raise ImportError(
+        f"Your version of pytest is too old. Got version {pytest.__version__}, you"
+        f" should have pytest >= {PYTEST_MIN_VERSION} installed."
+    )
+
+# True for SciPy >= 1.10. Read by `raccoon_face_or_skip` to choose between
+# `scipy.datasets` and `scipy.misc`, and when filling `dataset_fetchers`.
+scipy_datasets_require_network = sp_version >= parse_version("1.10")
+
+
+@pytest.fixture
+def enable_slep006():
+    """Enable metadata routing (SLEP006) while a requesting test runs.
+
+    The fixture is not autouse: only tests that request it run inside the
+    ``config_context(enable_metadata_routing=True)`` block.
+    """
+    with config_context(enable_metadata_routing=True):
+        yield
+
+
+def raccoon_face_or_skip():
+    # SciPy >= 1.10 requires network to access to get data
+    if scipy_datasets_require_network:
+        run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
+        if not run_network_tests:
+            raise SkipTest("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
+
+        try:
+            import pooch  # noqa
+        except ImportError:
+            raise SkipTest("test requires pooch to be installed")
+
+        from scipy.datasets import face
+    else:
+        from scipy.misc import face
+
+    return face(gray=True)
+
+
+# Maps dataset fixture names to the callables that fetch the data.
+# `pytest_collection_modifyitems` uses the keys to find the items that need
+# a dataset and calls the values to download the data before the tests run.
+dataset_fetchers = {
+    "fetch_20newsgroups_fxt": fetch_20newsgroups,
+    "fetch_20newsgroups_vectorized_fxt": fetch_20newsgroups_vectorized,
+    "fetch_california_housing_fxt": fetch_california_housing,
+    "fetch_covtype_fxt": fetch_covtype,
+    "fetch_kddcup99_fxt": fetch_kddcup99,
+    "fetch_lfw_pairs_fxt": fetch_lfw_pairs,
+    "fetch_lfw_people_fxt": fetch_lfw_people,
+    "fetch_olivetti_faces_fxt": fetch_olivetti_faces,
+    "fetch_rcv1_fxt": fetch_rcv1,
+    "fetch_species_distributions_fxt": fetch_species_distributions,
+}
+
+# The raccoon face only counts as a network dataset for SciPy >= 1.10.
+if scipy_datasets_require_network:
+    dataset_fetchers["raccoon_face_fxt"] = raccoon_face_or_skip
+
+# Skip marker applied to the float32 parametrization of `global_dtype`
+# unless the SKLEARN_RUN_FLOAT32_TESTS environment variable is set to "1".
+_SKIP32_MARK = pytest.mark.skipif(
+    environ.get("SKLEARN_RUN_FLOAT32_TESTS", "0") != "1",
+    reason="Set SKLEARN_RUN_FLOAT32_TESTS=1 to run float32 dtype tests",
+)
+
+
+# Global fixtures
+@pytest.fixture(params=[pytest.param(np.float32, marks=_SKIP32_MARK), np.float64])
+def global_dtype(request):
+    yield request.param
+
+
+def _fetch_fixture(f):
+    """Fetch dataset (download if missing and requested by environment)."""
+    download_if_missing = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
+
+    @wraps(f)
+    def wrapped(*args, **kwargs):
+        kwargs["download_if_missing"] = download_if_missing
+        try:
+            return f(*args, **kwargs)
+        except OSError as e:
+            if str(e) != "Data not found and `download_if_missing` is False":
+                raise
+            pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
+
+    return pytest.fixture(lambda: wrapped)
+
+
+# Adds fixtures for fetching data. Each fixture built by `_fetch_fixture`
+# provides the wrapped fetcher function itself, not the fetched dataset.
+fetch_20newsgroups_fxt = _fetch_fixture(fetch_20newsgroups)
+fetch_20newsgroups_vectorized_fxt = _fetch_fixture(fetch_20newsgroups_vectorized)
+fetch_california_housing_fxt = _fetch_fixture(fetch_california_housing)
+fetch_covtype_fxt = _fetch_fixture(fetch_covtype)
+fetch_kddcup99_fxt = _fetch_fixture(fetch_kddcup99)
+fetch_lfw_pairs_fxt = _fetch_fixture(fetch_lfw_pairs)
+fetch_lfw_people_fxt = _fetch_fixture(fetch_lfw_people)
+fetch_olivetti_faces_fxt = _fetch_fixture(fetch_olivetti_faces)
+fetch_rcv1_fxt = _fetch_fixture(fetch_rcv1)
+fetch_species_distributions_fxt = _fetch_fixture(fetch_species_distributions)
+# Unlike the fixtures above, this one calls `raccoon_face_or_skip` directly.
+raccoon_face_fxt = pytest.fixture(raccoon_face_or_skip)
+
+
+def pytest_collection_modifyitems(config, items):
+    """Called after collect is completed.
+
+    In order, this hook: marks items needing a dataset as skipped when
+    network tests are disabled (or pre-downloads the datasets when they are
+    enabled), marks GradientBoostingClassifier items as xfail on aarch64,
+    resets the globals of every doctest item, skips doctests on
+    configurations where they are not supported, and skips the image
+    doctests when pillow is missing.
+
+    Parameters
+    ----------
+    config : pytest config
+    items : list of collected items
+    """
+    run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
+    skip_network = pytest.mark.skip(
+        reason="test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0"
+    )
+
+    # download datasets during collection to avoid thread unsafe behavior
+    # when running pytest in parallel with pytest-xdist
+    dataset_features_set = set(dataset_fetchers)
+    datasets_to_download = set()
+
+    for item in items:
+        if isinstance(item, DoctestItem) and "fetch_" in item.name:
+            # doctest items carry no fixture names: derive the fixture key
+            # from the last component of the dotted item name
+            fetcher_function_name = item.name.split(".")[-1]
+            dataset_fetchers_key = f"{fetcher_function_name}_fxt"
+            dataset_to_fetch = set([dataset_fetchers_key]) & dataset_features_set
+        elif not hasattr(item, "fixturenames"):
+            continue
+        else:
+            item_fixtures = set(item.fixturenames)
+            dataset_to_fetch = item_fixtures & dataset_features_set
+
+        if not dataset_to_fetch:
+            continue
+
+        if run_network_tests:
+            datasets_to_download |= dataset_to_fetch
+        else:
+            # network tests are skipped
+            item.add_marker(skip_network)
+
+    # Only download datasets on the first worker spawned by pytest-xdist
+    # to avoid thread unsafe behavior. If pytest-xdist is not used, we still
+    # download before tests run.
+    worker_id = environ.get("PYTEST_XDIST_WORKER", "gw0")
+    if worker_id == "gw0" and run_network_tests:
+        for name in datasets_to_download:
+            # a fetcher may raise SkipTest (see raccoon_face_or_skip); that
+            # must not abort the collection
+            with suppress(SkipTest):
+                dataset_fetchers[name]()
+
+    for item in items:
+        # Known failure with GradientBoostingClassifier on ARM64
+        if (
+            item.name.endswith("GradientBoostingClassifier")
+            and platform.machine() == "aarch64"
+        ):
+            marker = pytest.mark.xfail(
+                reason=(
+                    "know failure. See "
+                    "https://github.com/scikit-learn/scikit-learn/issues/17797"  # noqa
+                )
+            )
+            item.add_marker(marker)
+
+    # Decide whether doctests are skipped. Several conditions may hold at
+    # once; `reason` keeps the message of the last one that matched.
+    skip_doctests = False
+    try:
+        import matplotlib  # noqa
+    except ImportError:
+        skip_doctests = True
+        reason = "matplotlib is required to run the doctests"
+
+    if _IS_32BIT:
+        reason = "doctest are only run when the default numpy int is 64 bits."
+        skip_doctests = True
+    elif sys.platform.startswith("win32"):
+        reason = (
+            "doctests are not run for Windows because numpy arrays "
+            "repr is inconsistent across platforms."
+        )
+        skip_doctests = True
+
+    if np_base_version >= parse_version("2"):
+        reason = "Due to NEP 51 numpy scalar repr has changed in numpy 2"
+        skip_doctests = True
+
+    # Normally doctest has the entire module's scope. Here we set globs to an empty dict
+    # to remove the module's scope:
+    # https://docs.python.org/3/library/doctest.html#what-s-the-execution-context
+    for item in items:
+        if isinstance(item, DoctestItem):
+            item.dtest.globs = {}
+
+    if skip_doctests:
+        skip_marker = pytest.mark.skip(reason=reason)
+
+        for item in items:
+            if isinstance(item, DoctestItem):
+                # work-around an internal error with pytest if adding a skip
+                # mark to a doctest in a contextmanager, see
+                # https://github.com/pytest-dev/pytest/issues/8796 for more
+                # details.
+                if item.name != "sklearn._config.config_context":
+                    item.add_marker(skip_marker)
+    # The two image doctests listed below are skipped when pillow is missing.
+    try:
+        import PIL  # noqa
+
+        pillow_installed = True
+    except ImportError:
+        pillow_installed = False
+
+    if not pillow_installed:
+        skip_marker = pytest.mark.skip(reason="pillow (or PIL) not installed!")
+        for item in items:
+            if item.name in [
+                "sklearn.feature_extraction.image.PatchExtractor",
+                "sklearn.feature_extraction.image.extract_patches_2d",
+            ]:
+                item.add_marker(skip_marker)
+
+
+@pytest.fixture(scope="function")
+def pyplot():
+    """Setup and teardown fixture for matplotlib.
+
+    This fixture checks if we can import matplotlib. If not, the tests will be
+    skipped. Otherwise, we close the figures before and after running the
+    functions.
+
+    Returns
+    -------
+    pyplot : module
+        The ``matplotlib.pyplot`` module.
+    """
+    pyplot = pytest.importorskip("matplotlib.pyplot")
+    pyplot.close("all")
+    yield pyplot
+    pyplot.close("all")
+
+
+def pytest_configure(config):
+    # Use matplotlib agg backend during the tests including doctests
+    try:
+        import matplotlib
+
+        matplotlib.use("agg")
+    except ImportError:
+        pass
+
+    allowed_parallelism = joblib.cpu_count(only_physical_cores=True)
+    xdist_worker_count = environ.get("PYTEST_XDIST_WORKER_COUNT")
+    if xdist_worker_count is not None:
+        # Set the number of OpenMP and BLAS threads based on the number of workers
+        # xdist is using to prevent oversubscription.
+        allowed_parallelism = max(allowed_parallelism // int(xdist_worker_count), 1)
+    threadpool_limits(allowed_parallelism)
+
+    # Register global_random_seed plugin if it is not already registered
+    if not config.pluginmanager.hasplugin("sklearn.tests.random_seed"):
+        config.pluginmanager.register(random_seed)
+
+    if environ.get("SKLEARN_WARNINGS_AS_ERRORS", "0") != "0":
+        # This seems like the only way to programmatically change the config
+        # filterwarnings. This was suggested in
+        # https://github.com/pytest-dev/pytest/issues/3311#issuecomment-373177592
+        for line in get_pytest_filterwarning_lines():
+            config.addinivalue_line("filterwarnings", line)
+
+
+@pytest.fixture
+def hide_available_pandas(monkeypatch):
+    """Pretend pandas was not installed."""
+    import_orig = builtins.__import__
+
+    def mocked_import(name, *args, **kwargs):
+        if name == "pandas":
+            raise ImportError()
+        return import_orig(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", mocked_import)
+
+
+@pytest.fixture
+def print_changed_only_false():
+    """Set `print_changed_only` to False for the duration of the test."""
+    set_config(print_changed_only=False)
+    yield
+    set_config(print_changed_only=True)  # reset to default
@@ -0,0 +1,44 @@
+"""
+The :mod:`sklearn.covariance` module includes methods and algorithms to
+robustly estimate the covariance of features given a set of points. The
+precision matrix defined as the inverse of the covariance is also estimated.
+Covariance estimation is closely related to the theory of Gaussian Graphical
+Models.
+"""
+
+from ._elliptic_envelope import EllipticEnvelope
+from ._empirical_covariance import (
+    EmpiricalCovariance,
+    empirical_covariance,
+    log_likelihood,
+)
+from ._graph_lasso import GraphicalLasso, GraphicalLassoCV, graphical_lasso
+from ._robust_covariance import MinCovDet, fast_mcd
+from ._shrunk_covariance import (
+    OAS,
+    LedoitWolf,
+    ShrunkCovariance,
+    ledoit_wolf,
+    ledoit_wolf_shrinkage,
+    oas,
+    shrunk_covariance,
+)
+
+__all__ = [
+    "EllipticEnvelope",
+    "EmpiricalCovariance",
+    "GraphicalLasso",
+    "GraphicalLassoCV",
+    "LedoitWolf",
+    "MinCovDet",
+    "OAS",
+    "ShrunkCovariance",
+    "empirical_covariance",
+    "fast_mcd",
+    "graphical_lasso",
+    "ledoit_wolf",
+    "ledoit_wolf_shrinkage",
+    "log_likelihood",
+    "oas",
+    "shrunk_covariance",
+]
@@ -0,0 +1,267 @@
+# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+from numbers import Real
+
+import numpy as np
+
+from ..base import OutlierMixin, _fit_context
+from ..metrics import accuracy_score
+from ..utils._param_validation import Interval
+from ..utils.validation import check_is_fitted
+from ._robust_covariance import MinCovDet
+
+
+class EllipticEnvelope(OutlierMixin, MinCovDet):
+    """An object for detecting outliers in a Gaussian distributed dataset.
+
+    Read more in the :ref:`User Guide <outlier_detection>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Specify if the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, the support of robust location and covariance estimates
+        is computed, and a covariance estimate is recomputed from it,
+        without centering the data.
+        Useful to work with data whose mean is significantly equal to
+        zero but is not exactly zero.
+        If False, the robust location and covariance are directly computed
+        with the FastMCD algorithm without additional treatment.
+
+    support_fraction : float, default=None
+        The proportion of points to be included in the support of the raw
+        MCD estimate. If None, the minimum value of support_fraction will
+        be used within the algorithm: `(n_samples + n_features + 1) / 2 * n_samples`.
+        Range is (0, 1).
+
+    contamination : float, default=0.1
+        The amount of contamination of the data set, i.e. the proportion
+        of outliers in the data set. Range is (0, 0.5].
+
+    random_state : int, RandomState instance or None, default=None
+        Determines the pseudo random number generator for shuffling
+        the data. Pass an int for reproducible results across multiple function
+        calls. See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    location_ : ndarray of shape (n_features,)
+        Estimated robust location.
+
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated robust covariance matrix.
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo inverse matrix.
+        (stored only if store_precision is True)
+
+    support_ : ndarray of shape (n_samples,)
+        A mask of the observations that have been used to compute the
+        robust estimates of location and shape.
+
+    offset_ : float
+        Offset used to define the decision function from the raw scores.
+        We have the relation: ``decision_function = score_samples - offset_``.
+        The offset depends on the contamination parameter and is defined in
+        such a way we obtain the expected number of outliers (samples with
+        decision function < 0) in training.
+
+        .. versionadded:: 0.20
+
+    raw_location_ : ndarray of shape (n_features,)
+        The raw robust estimated location before correction and re-weighting.
+
+    raw_covariance_ : ndarray of shape (n_features, n_features)
+        The raw robust estimated covariance before correction and re-weighting.
+
+    raw_support_ : ndarray of shape (n_samples,)
+        A mask of the observations that have been used to compute
+        the raw robust estimates of location and shape, before correction
+        and re-weighting.
+
+    dist_ : ndarray of shape (n_samples,)
+        Mahalanobis distances of the training set (on which :meth:`fit` is
+        called) observations.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    EmpiricalCovariance : Maximum likelihood covariance estimator.
+    GraphicalLasso : Sparse inverse covariance estimation
+        with an l1-penalized estimator.
+    LedoitWolf : LedoitWolf Estimator.
+    MinCovDet : Minimum Covariance Determinant
+        (robust estimator of covariance).
+    OAS : Oracle Approximating Shrinkage Estimator.
+    ShrunkCovariance : Covariance estimator with shrinkage.
+
+    Notes
+    -----
+    Outlier detection from covariance estimation may break or not
+    perform well in high-dimensional settings. In particular, one will
+    always take care to work with ``n_samples > n_features ** 2``.
+
+    References
+    ----------
+    .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
+       minimum covariance determinant estimator" Technometrics 41(3), 212
+       (1999)
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import EllipticEnvelope
+    >>> true_cov = np.array([[.8, .3],
+    ...                      [.3, .4]])
+    >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
+    ...                                                  cov=true_cov,
+    ...                                                  size=500)
+    >>> cov = EllipticEnvelope(random_state=0).fit(X)
+    >>> # predict returns 1 for an inlier and -1 for an outlier
+    >>> cov.predict([[0, 0],
+    ...              [3, 3]])
+    array([ 1, -1])
+    >>> cov.covariance_
+    array([[0.7411..., 0.2535...],
+           [0.2535..., 0.3053...]])
+    >>> cov.location_
+    array([0.0813... , 0.0427...])
+    """
+
+    _parameter_constraints: dict = {
+        **MinCovDet._parameter_constraints,
+        "contamination": [Interval(Real, 0, 0.5, closed="right")],
+    }
+
+    def __init__(
+        self,
+        *,
+        store_precision=True,
+        assume_centered=False,
+        support_fraction=None,
+        contamination=0.1,
+        random_state=None,
+    ):
+        super().__init__(
+            store_precision=store_precision,
+            assume_centered=assume_centered,
+            support_fraction=support_fraction,
+            random_state=random_state,
+        )
+        self.contamination = contamination
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the EllipticEnvelope model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        super().fit(X)
+        self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination)
+        return self
+
+    def decision_function(self, X):
+        """Compute the decision function of the given observations.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        decision : ndarray of shape (n_samples,)
+            Decision function of the samples.
+            It is equal to the shifted Mahalanobis distances.
+            The threshold for being an outlier is 0, which ensures a
+            compatibility with other outlier detection algorithms.
+        """
+        check_is_fitted(self)
+        negative_mahal_dist = self.score_samples(X)
+        return negative_mahal_dist - self.offset_
+
+    def score_samples(self, X):
+        """Compute the negative Mahalanobis distances.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        negative_mahal_distances : array-like of shape (n_samples,)
+            Opposite of the Mahalanobis distances.
+        """
+        check_is_fitted(self)
+        return -self.mahalanobis(X)
+
+    def predict(self, X):
+        """
+        Predict labels (1 inlier, -1 outlier) of X according to fitted model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        is_inlier : ndarray of shape (n_samples,)
+            Returns -1 for anomalies/outliers and +1 for inliers.
+        """
+        values = self.decision_function(X)
+        is_inlier = np.full(values.shape[0], -1, dtype=int)
+        is_inlier[values >= 0] = 1
+
+        return is_inlier
+
+    def score(self, X, y, sample_weight=None):
+        """Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for X.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) w.r.t. y.
+        """
+        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
@@ -0,0 +1,364 @@
+"""
+Maximum likelihood covariance estimator.
+
+"""
+
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Gael Varoquaux <gael.varoquaux@normalesup.org>
+#         Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+# avoid division truncation
+import warnings
+
+import numpy as np
+from scipy import linalg
+
+from .. import config_context
+from ..base import BaseEstimator, _fit_context
+from ..metrics.pairwise import pairwise_distances
+from ..utils import check_array
+from ..utils._param_validation import validate_params
+from ..utils.extmath import fast_logdet
+
+
+@validate_params(
+    {
+        "emp_cov": [np.ndarray],
+        "precision": [np.ndarray],
+    },
+    prefer_skip_nested_validation=True,
+)
+def log_likelihood(emp_cov, precision):
+    """Compute the sample mean of the log_likelihood under a covariance model.
+
+    Computes the empirical expected log-likelihood, allowing for universal
+    comparison (beyond this software package), and accounts for normalization
+    terms and scaling.
+
+    Parameters
+    ----------
+    emp_cov : ndarray of shape (n_features, n_features)
+        Maximum Likelihood Estimator of covariance.
+
+    precision : ndarray of shape (n_features, n_features)
+        The precision matrix of the covariance model to be tested.
+
+    Returns
+    -------
+    log_likelihood_ : float
+        Sample mean of the log-likelihood.
+    """
+    p = precision.shape[0]
+    log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision)
+    log_likelihood_ -= p * np.log(2 * np.pi)
+    log_likelihood_ /= 2.0
+    return log_likelihood_
+
+
+@validate_params(
+    {
+        "X": ["array-like"],
+        "assume_centered": ["boolean"],
+    },
+    prefer_skip_nested_validation=True,
+)
+def empirical_covariance(X, *, assume_centered=False):
+    """Compute the Maximum likelihood covariance estimator.
+
+    Parameters
+    ----------
+    X : ndarray of shape (n_samples, n_features)
+        Data from which to compute the covariance estimate.
+
+    assume_centered : bool, default=False
+        If `True`, data will not be centered before computation.
+        Useful when working with data whose mean is almost, but not exactly
+        zero.
+        If `False`, data will be centered before computation.
+
+    Returns
+    -------
+    covariance : ndarray of shape (n_features, n_features)
+        Empirical covariance (Maximum Likelihood Estimator).
+
+    Examples
+    --------
+    >>> from sklearn.covariance import empirical_covariance
+    >>> X = [[1,1,1],[1,1,1],[1,1,1],
+    ...      [0,0,0],[0,0,0],[0,0,0]]
+    >>> empirical_covariance(X)
+    array([[0.25, 0.25, 0.25],
+           [0.25, 0.25, 0.25],
+           [0.25, 0.25, 0.25]])
+    """
+    X = check_array(X, ensure_2d=False, force_all_finite=False)
+
+    if X.ndim == 1:
+        X = np.reshape(X, (1, -1))
+
+    if X.shape[0] == 1:
+        warnings.warn(
+            "Only one sample available. You may want to reshape your data array"
+        )
+
+    if assume_centered:
+        covariance = np.dot(X.T, X) / X.shape[0]
+    else:
+        covariance = np.cov(X.T, bias=1)
+
+    if covariance.ndim == 0:
+        covariance = np.array([[covariance]])
+    return covariance
+
+
+class EmpiricalCovariance(BaseEstimator):
+    """Maximum likelihood covariance estimator.
+
+    Read more in the :ref:`User Guide <covariance>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Specifies if the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, data are not centered before computation.
+        Useful when working with data whose mean is almost, but not exactly
+        zero.
+        If False (default), data are centered before computation.
+
+    Attributes
+    ----------
+    location_ : ndarray of shape (n_features,)
+        Estimated location, i.e. the estimated mean.
+
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated covariance matrix
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo-inverse matrix.
+        (stored only if store_precision is True)
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    EllipticEnvelope : An object for detecting outliers in
+        a Gaussian distributed dataset.
+    GraphicalLasso : Sparse inverse covariance estimation
+        with an l1-penalized estimator.
+    LedoitWolf : LedoitWolf Estimator.
+    MinCovDet : Minimum Covariance Determinant
+        (robust estimator of covariance).
+    OAS : Oracle Approximating Shrinkage Estimator.
+    ShrunkCovariance : Covariance estimator with shrinkage.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import EmpiricalCovariance
+    >>> from sklearn.datasets import make_gaussian_quantiles
+    >>> real_cov = np.array([[.8, .3],
+    ...                      [.3, .4]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0],
+    ...                             cov=real_cov,
+    ...                             size=500)
+    >>> cov = EmpiricalCovariance().fit(X)
+    >>> cov.covariance_
+    array([[0.7569..., 0.2818...],
+           [0.2818..., 0.3928...]])
+    >>> cov.location_
+    array([0.0622..., 0.0193...])
+    """
+
+    _parameter_constraints: dict = {
+        "store_precision": ["boolean"],
+        "assume_centered": ["boolean"],
+    }
+
+    def __init__(self, *, store_precision=True, assume_centered=False):
+        self.store_precision = store_precision
+        self.assume_centered = assume_centered
+
+    def _set_covariance(self, covariance):
+        """Saves the covariance and precision estimates
+
+        Storage is done accordingly to `self.store_precision`.
+        Precision stored only if invertible.
+
+        Parameters
+        ----------
+        covariance : array-like of shape (n_features, n_features)
+            Estimated covariance matrix to be stored, and from which precision
+            is computed.
+        """
+        covariance = check_array(covariance)
+        # set covariance
+        self.covariance_ = covariance
+        # set precision
+        if self.store_precision:
+            self.precision_ = linalg.pinvh(covariance, check_finite=False)
+        else:
+            self.precision_ = None
+
+    def get_precision(self):
+        """Getter for the precision matrix.
+
+        Returns
+        -------
+        precision_ : array-like of shape (n_features, n_features)
+            The precision matrix associated to the current covariance object.
+        """
+        if self.store_precision:
+            precision = self.precision_
+        else:
+            precision = linalg.pinvh(self.covariance_, check_finite=False)
+        return precision
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the maximum likelihood covariance estimator to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+          Training data, where `n_samples` is the number of samples and
+          `n_features` is the number of features.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        X = self._validate_data(X)
+        if self.assume_centered:
+            self.location_ = np.zeros(X.shape[1])
+        else:
+            self.location_ = X.mean(0)
+        covariance = empirical_covariance(X, assume_centered=self.assume_centered)
+        self._set_covariance(covariance)
+
+        return self
+
+    def score(self, X_test, y=None):
+        """Compute the log-likelihood of `X_test` under the estimated Gaussian model.
+
+        The Gaussian model is defined by its mean and covariance matrix which are
+        represented respectively by `self.location_` and `self.covariance_`.
+
+        Parameters
+        ----------
+        X_test : array-like of shape (n_samples, n_features)
+            Test data of which we compute the likelihood, where `n_samples` is
+            the number of samples and `n_features` is the number of features.
+            `X_test` is assumed to be drawn from the same distribution than
+            the data used in fit (including centering).
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        res : float
+            The log-likelihood of `X_test` with `self.location_` and `self.covariance_`
+            as estimators of the Gaussian model mean and covariance matrix respectively.
+        """
+        X_test = self._validate_data(X_test, reset=False)
+        # compute empirical covariance of the test set
+        test_cov = empirical_covariance(X_test - self.location_, assume_centered=True)
+        # compute log likelihood
+        res = log_likelihood(test_cov, self.get_precision())
+
+        return res
+
+    def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True):
+        """Compute the Mean Squared Error between two covariance estimators.
+
+        Parameters
+        ----------
+        comp_cov : array-like of shape (n_features, n_features)
+            The covariance to compare with.
+
+        norm : {"frobenius", "spectral"}, default="frobenius"
+            The type of norm used to compute the error. Available error types:
+            - 'frobenius' (default): sqrt(tr(A^t.A))
+            - 'spectral': sqrt(max(eigenvalues(A^t.A))
+            where A is the error ``(comp_cov - self.covariance_)``.
+
+        scaling : bool, default=True
+            If True (default), the squared error norm is divided by n_features.
+            If False, the squared error norm is not rescaled.
+
+        squared : bool, default=True
+            Whether to compute the squared error norm or the error norm.
+            If True (default), the squared error norm is returned.
+            If False, the error norm is returned.
+
+        Returns
+        -------
+        result : float
+            The Mean Squared Error (in the sense of the Frobenius norm) between
+            `self` and `comp_cov` covariance estimators.
+        """
+        # compute the error
+        error = comp_cov - self.covariance_
+        # compute the error norm
+        if norm == "frobenius":
+            squared_norm = np.sum(error**2)
+        elif norm == "spectral":
+            squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error)))
+        else:
+            raise NotImplementedError(
+                "Only spectral and frobenius norms are implemented"
+            )
+        # optionally scale the error norm
+        if scaling:
+            squared_norm = squared_norm / error.shape[0]
+        # finally get either the squared norm or the norm
+        if squared:
+            result = squared_norm
+        else:
+            result = np.sqrt(squared_norm)
+
+        return result
+
+    def mahalanobis(self, X):
+        """Compute the squared Mahalanobis distances of given observations.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The observations, the Mahalanobis distances of the which we
+            compute. Observations are assumed to be drawn from the same
+            distribution than the data used in fit.
+
+        Returns
+        -------
+        dist : ndarray of shape (n_samples,)
+            Squared Mahalanobis distances of the observations.
+        """
+        X = self._validate_data(X, reset=False)
+
+        precision = self.get_precision()
+        with config_context(assume_finite=True):
+            # compute mahalanobis distances
+            dist = pairwise_distances(
+                X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision
+            )
+
+        return np.reshape(dist, (len(X),)) ** 2
@@ -0,0 +1,869 @@
+"""
+Robust location and covariance estimators.
+
+Here are implemented estimators that are resistant to outliers.
+
+"""
+
+# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+import warnings
+from numbers import Integral, Real
+
+import numpy as np
+from scipy import linalg
+from scipy.stats import chi2
+
+from ..base import _fit_context
+from ..utils import check_array, check_random_state
+from ..utils._param_validation import Interval
+from ..utils.extmath import fast_logdet
+from ._empirical_covariance import EmpiricalCovariance, empirical_covariance
+
+
+# Minimum Covariance Determinant
+#   Implementation of an algorithm by Rousseeuw & Van Driessen described in
+#   (A Fast Algorithm for the Minimum Covariance Determinant Estimator,
+#   1999, American Statistical Association and the American Society
+#   for Quality, TECHNOMETRICS)
+# XXX Is this really a public function? It's not listed in the docs or
+# exported by sklearn.covariance. Deprecate?
+def c_step(
+    X,
+    n_support,
+    remaining_iterations=30,
+    initial_estimates=None,
+    verbose=False,
+    cov_computation_method=empirical_covariance,
+    random_state=None,
+):
+    """C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Data set in which we look for the n_support observations whose
+        scatter matrix has minimum determinant.
+
+    n_support : int
+        Number of observations to compute the robust estimates of location
+        and covariance from. This parameter must be greater than
+        `n_samples / 2`.
+
+    remaining_iterations : int, default=30
+        Number of iterations to perform.
+        According to [Rouseeuw1999]_, two iterations are sufficient to get
+        close to the minimum, and we never need more than 30 to reach
+        convergence.
+
+    initial_estimates : tuple of shape (2,), default=None
+        Initial estimates of location and shape from which to run the c_step
+        procedure:
+        - initial_estimates[0]: an initial location estimate
+        - initial_estimates[1]: an initial covariance estimate
+
+    verbose : bool, default=False
+        Verbose mode.
+
+    cov_computation_method : callable, \
+            default=:func:`sklearn.covariance.empirical_covariance`
+        The function which will be used to compute the covariance.
+        Must return array of shape (n_features, n_features).
+
+    random_state : int, RandomState instance or None, default=None
+        Determines the pseudo random number generator for shuffling the data.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Returns
+    -------
+    location : ndarray of shape (n_features,)
+        Robust location estimates.
+
+    covariance : ndarray of shape (n_features, n_features)
+        Robust covariance estimates.
+
+    support : ndarray of shape (n_samples,)
+        A mask for the `n_support` observations whose scatter matrix has
+        minimum determinant.
+
+    References
+    ----------
+    .. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant
+        Estimator, 1999, American Statistical Association and the American
+        Society for Quality, TECHNOMETRICS
+    """
+    X = np.asarray(X)
+    random_state = check_random_state(random_state)
+    return _c_step(
+        X,
+        n_support,
+        remaining_iterations=remaining_iterations,
+        initial_estimates=initial_estimates,
+        verbose=verbose,
+        cov_computation_method=cov_computation_method,
+        random_state=random_state,
+    )
+
+
+def _c_step(
+    X,
+    n_support,
+    random_state,
+    remaining_iterations=30,
+    initial_estimates=None,
+    verbose=False,
+    cov_computation_method=empirical_covariance,
+):
+    n_samples, n_features = X.shape
+    dist = np.inf
+
+    # Initialisation
+    support = np.zeros(n_samples, dtype=bool)
+    if initial_estimates is None:
+        # compute initial robust estimates from a random subset
+        support[random_state.permutation(n_samples)[:n_support]] = True
+    else:
+        # get initial robust estimates from the function parameters
+        location = initial_estimates[0]
+        covariance = initial_estimates[1]
+        # run a special iteration for that case (to get an initial support)
+        precision = linalg.pinvh(covariance)
+        X_centered = X - location
+        dist = (np.dot(X_centered, precision) * X_centered).sum(1)
+        # compute new estimates
+        support[np.argsort(dist)[:n_support]] = True
+
+    X_support = X[support]
+    location = X_support.mean(0)
+    covariance = cov_computation_method(X_support)
+
+    # Iterative procedure for Minimum Covariance Determinant computation
+    det = fast_logdet(covariance)
+    # If the data already has singular covariance, calculate the precision,
+    # as the loop below will not be entered.
+    if np.isinf(det):
+        precision = linalg.pinvh(covariance)
+
+    previous_det = np.inf
+    while det < previous_det and remaining_iterations > 0 and not np.isinf(det):
+        # save old estimates values
+        previous_location = location
+        previous_covariance = covariance
+        previous_det = det
+        previous_support = support
+        # compute a new support from the full data set mahalanobis distances
+        precision = linalg.pinvh(covariance)
+        X_centered = X - location
+        dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)
+        # compute new estimates
+        support = np.zeros(n_samples, dtype=bool)
+        support[np.argsort(dist)[:n_support]] = True
+        X_support = X[support]
+        location = X_support.mean(axis=0)
+        covariance = cov_computation_method(X_support)
+        det = fast_logdet(covariance)
+        # update remaining iterations for early stopping
+        remaining_iterations -= 1
+
+    previous_dist = dist
+    dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)
+    # Check if best fit already found (det => 0, logdet => -inf)
+    if np.isinf(det):
+        results = location, covariance, det, support, dist
+    # Check convergence
+    if np.allclose(det, previous_det):
+        # c_step procedure converged
+        if verbose:
+            print(
+                "Optimal couple (location, covariance) found before"
+                " ending iterations (%d left)" % (remaining_iterations)
+            )
+        results = location, covariance, det, support, dist
+    elif det > previous_det:
+        # determinant has increased (should not happen)
+        warnings.warn(
+            "Determinant has increased; this should not happen: "
+            "log(det) > log(previous_det) (%.15f > %.15f). "
+            "You may want to try with a higher value of "
+            "support_fraction (current value: %.3f)."
+            % (det, previous_det, n_support / n_samples),
+            RuntimeWarning,
+        )
+        results = (
+            previous_location,
+            previous_covariance,
+            previous_det,
+            previous_support,
+            previous_dist,
+        )
+
+    # Check early stopping
+    if remaining_iterations == 0:
+        if verbose:
+            print("Maximum number of iterations reached")
+        results = location, covariance, det, support, dist
+
+    return results
+
+
+def select_candidates(
+    X,
+    n_support,
+    n_trials,
+    select=1,
+    n_iter=30,
+    verbose=False,
+    cov_computation_method=empirical_covariance,
+    random_state=None,
+):
+    """Finds the best pure subset of observations to compute MCD from it.
+
+    The purpose of this function is to find the best sets of n_support
+    observations with respect to a minimization of their covariance
+    matrix determinant. Equivalently, it removes n_samples-n_support
+    observations to construct what we call a pure data set (i.e. not
+    containing outliers). The list of the observations of the pure
+    data set is referred to as the `support`.
+
+    Starting from a random support, the pure data set is found by the
+    c_step procedure introduced by Rousseeuw and Van Driessen in
+    [RV]_.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Data (sub)set in which we look for the n_support purest observations.
+
+    n_support : int
+        The number of samples the pure data set must contain.
+        This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.
+
+    n_trials : int or tuple of shape (2,)
+        Number of different initial sets of observations from which to
+        run the algorithm. This parameter should be a strictly positive
+        integer.
+        Instead of giving a number of trials to perform, one can provide a
+        list of initial estimates that will be used to iteratively run
+        c_step procedures. In this case:
+        - n_trials[0]: array-like, shape (n_trials, n_features)
+          is the list of `n_trials` initial location estimates
+        - n_trials[1]: array-like, shape (n_trials, n_features, n_features)
+          is the list of `n_trials` initial covariances estimates
+
+    select : int, default=1
+        Number of best candidates results to return. This parameter must be
+        a strictly positive integer.
+
+    n_iter : int, default=30
+        Maximum number of iterations for the c_step procedure.
+        (2 is enough to be close to the final solution. "Never" exceeds 20).
+        This parameter must be a strictly positive integer.
+
+    verbose : bool, default=False
+        Control the output verbosity.
+
+    cov_computation_method : callable, \
+            default=:func:`sklearn.covariance.empirical_covariance`
+        The function which will be used to compute the covariance.
+        Must return an array of shape (n_features, n_features).
+
+    random_state : int, RandomState instance or None, default=None
+        Determines the pseudo random number generator for shuffling the data.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    See Also
+    ---------
+    c_step
+
+    Returns
+    -------
+    best_locations : ndarray of shape (select, n_features)
+        The `select` location estimates computed from the `select` best
+        supports found in the data set (`X`).
+
+    best_covariances : ndarray of shape (select, n_features, n_features)
+        The `select` covariance estimates computed from the `select`
+        best supports found in the data set (`X`).
+
+    best_supports : ndarray of shape (select, n_samples)
+        The `select` best supports found in the data set (`X`).
+
+    References
+    ----------
+    .. [RV] A Fast Algorithm for the Minimum Covariance Determinant
+        Estimator, 1999, American Statistical Association and the American
+        Society for Quality, TECHNOMETRICS
+    """
+    random_state = check_random_state(random_state)
+
+    if isinstance(n_trials, Integral):
+        run_from_estimates = False
+    elif isinstance(n_trials, tuple):
+        run_from_estimates = True
+        estimates_list = n_trials
+        n_trials = estimates_list[0].shape[0]
+    else:
+        raise TypeError(
+            "Invalid 'n_trials' parameter, expected tuple or  integer, got %s (%s)"
+            % (n_trials, type(n_trials))
+        )
+
+    # compute `n_trials` location and shape estimates candidates in the subset
+    all_estimates = []
+    if not run_from_estimates:
+        # perform `n_trials` computations from random initial supports
+        for j in range(n_trials):
+            all_estimates.append(
+                _c_step(
+                    X,
+                    n_support,
+                    remaining_iterations=n_iter,
+                    verbose=verbose,
+                    cov_computation_method=cov_computation_method,
+                    random_state=random_state,
+                )
+            )
+    else:
+        # perform computations from every given initial estimates
+        for j in range(n_trials):
+            initial_estimates = (estimates_list[0][j], estimates_list[1][j])
+            all_estimates.append(
+                _c_step(
+                    X,
+                    n_support,
+                    remaining_iterations=n_iter,
+                    initial_estimates=initial_estimates,
+                    verbose=verbose,
+                    cov_computation_method=cov_computation_method,
+                    random_state=random_state,
+                )
+            )
+    all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub = zip(
+        *all_estimates
+    )
+    # find the `n_best` best results among the `n_trials` ones
+    index_best = np.argsort(all_dets_sub)[:select]
+    best_locations = np.asarray(all_locs_sub)[index_best]
+    best_covariances = np.asarray(all_covs_sub)[index_best]
+    best_supports = np.asarray(all_supports_sub)[index_best]
+    best_ds = np.asarray(all_ds_sub)[index_best]
+
+    return best_locations, best_covariances, best_supports, best_ds
+
+
+def fast_mcd(
+    X,
+    support_fraction=None,
+    cov_computation_method=empirical_covariance,
+    random_state=None,
+):
+    """Estimate the Minimum Covariance Determinant matrix.
+
+    Read more in the :ref:`User Guide <robust_covariance>`.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        The data matrix, with p features and n samples.
+
+    support_fraction : float, default=None
+        The proportion of points to be included in the support of the raw
+        MCD estimate. Default is `None`, which implies that the minimum
+        value of `support_fraction` will be used within the algorithm:
+        `(n_samples + n_features + 1) / 2 * n_samples`. This parameter must be
+        in the range (0, 1).
+
+    cov_computation_method : callable, \
+            default=:func:`sklearn.covariance.empirical_covariance`
+        The function which will be used to compute the covariance.
+        Must return an array of shape (n_features, n_features).
+
+    random_state : int, RandomState instance or None, default=None
+        Determines the pseudo random number generator for shuffling the data.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Returns
+    -------
+    location : ndarray of shape (n_features,)
+        Robust location of the data.
+
+    covariance : ndarray of shape (n_features, n_features)
+        Robust covariance of the features.
+
+    support : ndarray of shape (n_samples,), dtype=bool
+        A mask of the observations that have been used to compute
+        the robust location and covariance estimates of the data set.
+
+    Notes
+    -----
+    The FastMCD algorithm has been introduced by Rousseuw and Van Driessen
+    in "A Fast Algorithm for the Minimum Covariance Determinant Estimator,
+    1999, American Statistical Association and the American Society
+    for Quality, TECHNOMETRICS".
+    The principle is to compute robust estimates and random subsets before
+    pooling them into a larger subsets, and finally into the full data set.
+    Depending on the size of the initial sample, we have one, two or three
+    such computation levels.
+
+    Note that only raw estimates are returned. If one is interested in
+    the correction and reweighting steps described in [RouseeuwVan]_,
+    see the MinCovDet object.
+
+    References
+    ----------
+
+    .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance
+        Determinant Estimator, 1999, American Statistical Association
+        and the American Society for Quality, TECHNOMETRICS
+
+    .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,
+        Asymptotics For The Minimum Covariance Determinant Estimator,
+        The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
+    """
+    random_state = check_random_state(random_state)
+
+    X = check_array(X, ensure_min_samples=2, estimator="fast_mcd")
+    n_samples, n_features = X.shape
+
+    # minimum breakdown value
+    if support_fraction is None:
+        n_support = int(np.ceil(0.5 * (n_samples + n_features + 1)))
+    else:
+        n_support = int(support_fraction * n_samples)
+
+    # 1-dimensional case quick computation
+    # (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust
+    #  Regression and Outlier Detection, John Wiley & Sons, chapter 4)
+    if n_features == 1:
+        if n_support < n_samples:
+            # find the sample shortest halves
+            X_sorted = np.sort(np.ravel(X))
+            diff = X_sorted[n_support:] - X_sorted[: (n_samples - n_support)]
+            halves_start = np.where(diff == np.min(diff))[0]
+            # take the middle points' mean to get the robust location estimate
+            location = (
+                0.5
+                * (X_sorted[n_support + halves_start] + X_sorted[halves_start]).mean()
+            )
+            support = np.zeros(n_samples, dtype=bool)
+            X_centered = X - location
+            support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True
+            covariance = np.asarray([[np.var(X[support])]])
+            location = np.array([location])
+            # get precision matrix in an optimized way
+            precision = linalg.pinvh(covariance)
+            dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
+        else:
+            support = np.ones(n_samples, dtype=bool)
+            covariance = np.asarray([[np.var(X)]])
+            location = np.asarray([np.mean(X)])
+            X_centered = X - location
+            # get precision matrix in an optimized way
+            precision = linalg.pinvh(covariance)
+            dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
+    # Starting FastMCD algorithm for p-dimensional case
+    if (n_samples > 500) and (n_features > 1):
+        # 1. Find candidate supports on subsets
+        # a. split the set in subsets of size ~ 300
+        n_subsets = n_samples // 300
+        n_samples_subsets = n_samples // n_subsets
+        samples_shuffle = random_state.permutation(n_samples)
+        h_subset = int(np.ceil(n_samples_subsets * (n_support / float(n_samples))))
+        # b. perform a total of 500 trials
+        n_trials_tot = 500
+        # c. select 10 best (location, covariance) for each subset
+        n_best_sub = 10
+        n_trials = max(10, n_trials_tot // n_subsets)
+        n_best_tot = n_subsets * n_best_sub
+        all_best_locations = np.zeros((n_best_tot, n_features))
+        try:
+            all_best_covariances = np.zeros((n_best_tot, n_features, n_features))
+        except MemoryError:
+            # The above is too big. Let's try with something much small
+            # (and less optimal)
+            n_best_tot = 10
+            all_best_covariances = np.zeros((n_best_tot, n_features, n_features))
+            n_best_sub = 2
+        for i in range(n_subsets):
+            low_bound = i * n_samples_subsets
+            high_bound = low_bound + n_samples_subsets
+            current_subset = X[samples_shuffle[low_bound:high_bound]]
+            best_locations_sub, best_covariances_sub, _, _ = select_candidates(
+                current_subset,
+                h_subset,
+                n_trials,
+                select=n_best_sub,
+                n_iter=2,
+                cov_computation_method=cov_computation_method,
+                random_state=random_state,
+            )
+            subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub)
+            all_best_locations[subset_slice] = best_locations_sub
+            all_best_covariances[subset_slice] = best_covariances_sub
+        # 2. Pool the candidate supports into a merged set
+        # (possibly the full dataset)
+        n_samples_merged = min(1500, n_samples)
+        h_merged = int(np.ceil(n_samples_merged * (n_support / float(n_samples))))
+        if n_samples > 1500:
+            n_best_merged = 10
+        else:
+            n_best_merged = 1
+        # find the best couples (location, covariance) on the merged set
+        selection = random_state.permutation(n_samples)[:n_samples_merged]
+        locations_merged, covariances_merged, supports_merged, d = select_candidates(
+            X[selection],
+            h_merged,
+            n_trials=(all_best_locations, all_best_covariances),
+            select=n_best_merged,
+            cov_computation_method=cov_computation_method,
+            random_state=random_state,
+        )
+        # 3. Finally get the overall best (locations, covariance) couple
+        if n_samples < 1500:
+            # directly get the best couple (location, covariance)
+            location = locations_merged[0]
+            covariance = covariances_merged[0]
+            support = np.zeros(n_samples, dtype=bool)
+            dist = np.zeros(n_samples)
+            support[selection] = supports_merged[0]
+            dist[selection] = d[0]
+        else:
+            # select the best couple on the full dataset
+            locations_full, covariances_full, supports_full, d = select_candidates(
+                X,
+                n_support,
+                n_trials=(locations_merged, covariances_merged),
+                select=1,
+                cov_computation_method=cov_computation_method,
+                random_state=random_state,
+            )
+            location = locations_full[0]
+            covariance = covariances_full[0]
+            support = supports_full[0]
+            dist = d[0]
+    elif n_features > 1:
+        # 1. Find the 10 best couples (location, covariance)
+        # considering two iterations
+        n_trials = 30
+        n_best = 10
+        locations_best, covariances_best, _, _ = select_candidates(
+            X,
+            n_support,
+            n_trials=n_trials,
+            select=n_best,
+            n_iter=2,
+            cov_computation_method=cov_computation_method,
+            random_state=random_state,
+        )
+        # 2. Select the best couple on the full dataset amongst the 10
+        locations_full, covariances_full, supports_full, d = select_candidates(
+            X,
+            n_support,
+            n_trials=(locations_best, covariances_best),
+            select=1,
+            cov_computation_method=cov_computation_method,
+            random_state=random_state,
+        )
+        location = locations_full[0]
+        covariance = covariances_full[0]
+        support = supports_full[0]
+        dist = d[0]
+
+    return location, covariance, support, dist
+
+
+class MinCovDet(EmpiricalCovariance):
+    """Minimum Covariance Determinant (MCD): robust estimator of covariance.
+
+    The Minimum Covariance Determinant covariance estimator is to be applied
+    on Gaussian-distributed data, but could still be relevant on data
+    drawn from a unimodal, symmetric distribution. It is not meant to be used
+    with multi-modal data (the algorithm used to fit a MinCovDet object is
+    likely to fail in such a case).
+    One should consider projection pursuit methods to deal with multi-modal
+    datasets.
+
+    Read more in the :ref:`User Guide <robust_covariance>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Specify if the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, the support of the robust location and the covariance
+        estimates is computed, and a covariance estimate is recomputed from
+        it, without centering the data.
+        Useful to work with data whose mean is almost, but not exactly,
+        zero.
+        If False, the robust location and covariance are directly computed
+        with the FastMCD algorithm without additional treatment.
+
+    support_fraction : float, default=None
+        The proportion of points to be included in the support of the raw
+        MCD estimate. Default is None, which implies that the minimum
+        value of support_fraction will be used within the algorithm:
+        `(n_samples + n_features + 1) / (2 * n_samples)`. The parameter must
+        be in the range (0, 1].
+
+    random_state : int, RandomState instance or None, default=None
+        Determines the pseudo random number generator for shuffling the data.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    raw_location_ : ndarray of shape (n_features,)
+        The raw robust estimated location before correction and re-weighting.
+
+    raw_covariance_ : ndarray of shape (n_features, n_features)
+        The raw robust estimated covariance before correction and re-weighting.
+
+    raw_support_ : ndarray of shape (n_samples,)
+        A mask of the observations that have been used to compute
+        the raw robust estimates of location and shape, before correction
+        and re-weighting.
+
+    location_ : ndarray of shape (n_features,)
+        Estimated robust location.
+
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated robust covariance matrix.
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo inverse matrix.
+        (stored only if store_precision is True)
+
+    support_ : ndarray of shape (n_samples,)
+        A mask of the observations that have been used to compute
+        the robust estimates of location and shape.
+
+    dist_ : ndarray of shape (n_samples,)
+        Mahalanobis distances of the training set (on which :meth:`fit` is
+        called) observations.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    EllipticEnvelope : An object for detecting outliers in
+        a Gaussian distributed dataset.
+    EmpiricalCovariance : Maximum likelihood covariance estimator.
+    GraphicalLasso : Sparse inverse covariance estimation
+        with an l1-penalized estimator.
+    GraphicalLassoCV : Sparse inverse covariance with cross-validated
+        choice of the l1 penalty.
+    LedoitWolf : LedoitWolf Estimator.
+    OAS : Oracle Approximating Shrinkage Estimator.
+    ShrunkCovariance : Covariance estimator with shrinkage.
+
+    References
+    ----------
+
+    .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression.
+        J. Am Stat Ass, 79:871, 1984.
+    .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant
+        Estimator, 1999, American Statistical Association and the American
+        Society for Quality, TECHNOMETRICS
+    .. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,
+        Asymptotics For The Minimum Covariance Determinant Estimator,
+        The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import MinCovDet
+    >>> from sklearn.datasets import make_gaussian_quantiles
+    >>> real_cov = np.array([[.8, .3],
+    ...                      [.3, .4]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0],
+    ...                                   cov=real_cov,
+    ...                                   size=500)
+    >>> cov = MinCovDet(random_state=0).fit(X)
+    >>> cov.covariance_
+    array([[0.7411..., 0.2535...],
+           [0.2535..., 0.3053...]])
+    >>> cov.location_
+    array([0.0813... , 0.0427...])
+    """
+
+    # Parameter constraints: those of the parent estimator plus the two
+    # parameters introduced by this class.
+    _parameter_constraints: dict = {
+        **EmpiricalCovariance._parameter_constraints,
+        "support_fraction": [Interval(Real, 0, 1, closed="right"), None],
+        "random_state": ["random_state"],
+    }
+    # Covariance routine applied to the selected observations. It is handed to
+    # `fast_mcd` as `cov_computation_method` and reused in `fit` and
+    # `reweight_covariance`; `staticmethod` keeps it from being bound to the
+    # instance when accessed through `self`.
+    _nonrobust_covariance = staticmethod(empirical_covariance)
+
+    def __init__(
+        self,
+        *,
+        store_precision=True,
+        assume_centered=False,
+        support_fraction=None,
+        random_state=None,
+    ):
+        self.store_precision = store_precision
+        self.assume_centered = assume_centered
+        self.support_fraction = support_fraction
+        self.random_state = random_state
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit a Minimum Covariance Determinant with the FastMCD algorithm.
+
+        The raw estimates returned by `fast_mcd` are stored in the `raw_*_`
+        attributes, then :meth:`correct_covariance` and
+        :meth:`reweight_covariance` are applied in that order.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        X = self._validate_data(X, ensure_min_samples=2, estimator="MinCovDet")
+        random_state = check_random_state(self.random_state)
+        n_samples, n_features = X.shape
+        # check that the empirical covariance is full rank: count the singular
+        # values of X.T @ X above 1e-8 (X is not centered for this check) and
+        # only warn, the fit goes on regardless
+        if (linalg.svdvals(np.dot(X.T, X)) > 1e-8).sum() != n_features:
+            warnings.warn(
+                "The covariance matrix associated to your dataset is not full rank"
+            )
+        # compute and store raw estimates
+        raw_location, raw_covariance, raw_support, raw_dist = fast_mcd(
+            X,
+            support_fraction=self.support_fraction,
+            cov_computation_method=self._nonrobust_covariance,
+            random_state=random_state,
+        )
+        if self.assume_centered:
+            # keep the support selected by fast_mcd, but replace the location
+            # by zeros and recompute covariance and distances without
+            # centering the data
+            raw_location = np.zeros(n_features)
+            raw_covariance = self._nonrobust_covariance(
+                X[raw_support], assume_centered=True
+            )
+            # get precision matrix in an optimized way
+            precision = linalg.pinvh(raw_covariance)
+            raw_dist = np.sum(np.dot(X, precision) * X, 1)
+        self.raw_location_ = raw_location
+        self.raw_covariance_ = raw_covariance
+        self.raw_support_ = raw_support
+        # provisional values: location_, support_ and dist_ are overwritten by
+        # reweight_covariance below
+        self.location_ = raw_location
+        self.support_ = raw_support
+        self.dist_ = raw_dist
+        # obtain consistency at normal models
+        # NOTE: the corrected covariance returned by correct_covariance is not
+        # stored here; the call is used for its rescaling of self.dist_
+        self.correct_covariance(X)
+        # re-weight estimator
+        self.reweight_covariance(X)
+
+        return self
+
+    def correct_covariance(self, data):
+        """Apply a correction to raw Minimum Covariance Determinant estimates.
+
+        Correction using the empirical correction factor suggested
+        by Rousseeuw and Van Driessen in [RVD]_.
+
+        Besides returning the corrected covariance, this method divides
+        `self.dist_` by the correction factor. The corrected covariance is
+        only returned; it is not assigned to any attribute by this method.
+
+        Parameters
+        ----------
+        data : array-like of shape (n_samples, n_features)
+            The data matrix, with p features and n samples.
+            The data set must be the one which was used to compute
+            the raw estimates.
+
+        Returns
+        -------
+        covariance_corrected : ndarray of shape (n_features, n_features)
+            Corrected robust covariance estimate.
+
+        Raises
+        ------
+        ValueError
+            If the support does not contain every sample and the raw
+            covariance is numerically equal to 0.
+
+        References
+        ----------
+
+        .. [RVD] A Fast Algorithm for the Minimum Covariance
+            Determinant Estimator, 1999, American Statistical Association
+            and the American Society for Quality, TECHNOMETRICS
+        """
+
+        # Check that the covariance of the support data is not equal to 0.
+        # Otherwise self.dist_ = 0 and thus correction = 0.
+        n_samples = len(self.dist_)
+        n_support = np.sum(self.support_)
+        if n_support < n_samples and np.allclose(self.raw_covariance_, 0):
+            raise ValueError(
+                "The covariance matrix of the support data "
+                "is equal to 0, try to increase support_fraction"
+            )
+        # ratio between the median of the observed distances and
+        # chi2(n_features).isf(0.5) (presumably scipy.stats.chi2, in which case
+        # this is the median of the chi-square distribution)
+        correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)
+        covariance_corrected = self.raw_covariance_ * correction
+        # rescale the stored distances with the same factor
+        self.dist_ /= correction
+        return covariance_corrected
+
+    def reweight_covariance(self, data):
+        """Re-weight raw Minimum Covariance Determinant estimates.
+
+        Re-weight observations using Rousseeuw's method (equivalent to
+        deleting outlying observations from the data set before
+        computing location and covariance estimates) described
+        in [RVDriessen]_.
+
+        The re-weighted estimates are also stored on the estimator: the
+        covariance through `_set_covariance`, and `location_`, `support_`
+        and `dist_` by direct assignment.
+
+        Parameters
+        ----------
+        data : array-like of shape (n_samples, n_features)
+            The data matrix, with p features and n samples.
+            The data set must be the one which was used to compute
+            the raw estimates.
+
+        Returns
+        -------
+        location_reweighted : ndarray of shape (n_features,)
+            Re-weighted robust location estimate.
+
+        covariance_reweighted : ndarray of shape (n_features, n_features)
+            Re-weighted robust covariance estimate.
+
+        support_reweighted : ndarray of shape (n_samples,), dtype=bool
+            A mask of the observations that have been used to compute
+            the re-weighted robust location and covariance estimates.
+
+        References
+        ----------
+
+        .. [RVDriessen] A Fast Algorithm for the Minimum Covariance
+            Determinant Estimator, 1999, American Statistical Association
+            and the American Society for Quality, TECHNOMETRICS
+        """
+        n_samples, n_features = data.shape
+        # keep the observations whose current distance is below the threshold
+        # chi2(n_features).isf(0.025)
+        mask = self.dist_ < chi2(n_features).isf(0.025)
+        if self.assume_centered:
+            location_reweighted = np.zeros(n_features)
+        else:
+            location_reweighted = data[mask].mean(0)
+        covariance_reweighted = self._nonrobust_covariance(
+            data[mask], assume_centered=self.assume_centered
+        )
+        support_reweighted = np.zeros(n_samples, dtype=bool)
+        support_reweighted[mask] = True
+        self._set_covariance(covariance_reweighted)
+        self.location_ = location_reweighted
+        self.support_ = support_reweighted
+        # recompute the distances of all observations with the re-weighted
+        # location and the precision obtained from get_precision()
+        X_centered = data - self.location_
+        self.dist_ = np.sum(np.dot(X_centered, self.get_precision()) * X_centered, 1)
+        return location_reweighted, covariance_reweighted, support_reweighted
@@ -0,0 +1,816 @@
+"""
+Covariance estimators using shrinkage.
+
+Shrinkage corresponds to regularising `cov` using a convex combination:
+shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate.
+
+"""
+
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Gael Varoquaux <gael.varoquaux@normalesup.org>
+#         Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+# avoid division truncation
+import warnings
+from numbers import Integral, Real
+
+import numpy as np
+
+from ..base import _fit_context
+from ..utils import check_array
+from ..utils._param_validation import Interval, validate_params
+from . import EmpiricalCovariance, empirical_covariance
+
+
+def _ledoit_wolf(X, *, assume_centered, block_size):
+    """Return the Ledoit-Wolf shrunk covariance of `X` and its shrinkage.
+
+    The result is the tuple `(shrunk_cov, shrinkage)`.
+    """
+    if len(X.shape) == 2 and X.shape[1] == 1:
+        # A single feature: the estimate does not depend on the shrinkage,
+        # so return the (optionally centered) second moment with shrinkage 0.
+        centered = X if assume_centered else X - X.mean()
+        return np.atleast_2d((centered**2).mean()), 0.0
+
+    n_features = X.shape[1]
+
+    # Shrinkage coefficient first, then the empirical covariance it applies to.
+    coef = ledoit_wolf_shrinkage(
+        X, assume_centered=assume_centered, block_size=block_size
+    )
+    emp_cov = empirical_covariance(X, assume_centered=assume_centered)
+
+    # Convex combination of emp_cov and mu * identity: scale the whole matrix,
+    # then add the target on the diagonal only.
+    mu = np.trace(emp_cov) / n_features
+    shrunk_cov = (1.0 - coef) * emp_cov
+    shrunk_cov.flat[:: n_features + 1] += coef * mu
+
+    return shrunk_cov, coef
+
+
+def _oas(X, *, assume_centered=False):
+    """Estimate covariance with the Oracle Approximating Shrinkage algorithm.
+
+    Returns the tuple `(shrunk_cov, shrinkage)`.
+
+    The formulation is based on [1]_.
+    [1] "Shrinkage algorithms for MMSE covariance estimation.",
+        Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
+        IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
+        https://arxiv.org/pdf/0907.4698.pdf
+    """
+    if len(X.shape) == 2 and X.shape[1] == 1:
+        # A single feature: the estimate does not depend on the shrinkage.
+        centered = X if assume_centered else X - X.mean()
+        return np.atleast_2d((centered**2).mean()), 0.0
+
+    n_samples, n_features = X.shape
+    emp_cov = empirical_covariance(X, assume_centered=assume_centered)
+
+    # With S the empirical covariance, n = n_samples and p = n_features, the
+    # shrinkage computed below is
+    #
+    #     min(
+    #         (trace(S @ S.T) + trace(S)**2)
+    #         / ((n + 1) * (trace(S @ S.T) - trace(S)**2 / p)),
+    #         1,
+    #     )
+    #
+    # (cf. Eq. 23 in [1]; the 2 / p terms of that equation are omitted).
+    #
+    # trace(S @ S.T) is the squared Frobenius norm of S, i.e. the sum of its
+    # squared entries, so np.mean(S**2) equals trace(S @ S.T) / p**2.
+    # Likewise mu**2 equals trace(S)**2 / p**2. The common 1 / p**2 factor
+    # cancels between numerator and denominator.
+    mean_sq = np.mean(emp_cov**2)
+    mu = np.trace(emp_cov) / n_features
+    mu_sq = mu**2
+
+    numerator = mean_sq + mu_sq
+    denominator = (n_samples + 1) * (mean_sq - mu_sq / n_features)
+    if denominator == 0:
+        shrinkage = 1.0
+    else:
+        shrinkage = min(numerator / denominator, 1.0)
+
+    # (1 - shrinkage) * S + shrinkage * F (cf. Eq. 4 in [1]) with the target
+    # F = mu * identity (cf. Eq. 3 in [1]): only the diagonal receives the
+    # additive term.
+    shrunk_cov = (1.0 - shrinkage) * emp_cov
+    shrunk_cov.flat[:: n_features + 1] += shrinkage * mu
+
+    return shrunk_cov, shrinkage
+
+
+###############################################################################
+# Public API
+# ShrunkCovariance estimator
+
+
+@validate_params(
+    {
+        "emp_cov": ["array-like"],
+        "shrinkage": [Interval(Real, 0, 1, closed="both")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def shrunk_covariance(emp_cov, shrinkage=0.1):
+    """Shrink covariance matrices towards a scaled identity.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    emp_cov : array-like of shape (..., n_features, n_features)
+        Covariance matrices to be shrunk, at least 2D ndarray.
+
+    shrinkage : float, default=0.1
+        Weight of the identity target in the convex combination.
+        Range is [0, 1].
+
+    Returns
+    -------
+    shrunk_cov : ndarray of shape (..., n_features, n_features)
+        Shrunk covariance matrices.
+
+    Notes
+    -----
+    Each matrix is replaced by::
+
+        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+    where `mu = trace(cov) / n_features`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_gaussian_quantiles
+    >>> from sklearn.covariance import empirical_covariance, shrunk_covariance
+    >>> real_cov = np.array([[.8, .3], [.3, .4]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
+    >>> shrunk_covariance(empirical_covariance(X))
+    array([[0.73..., 0.25...],
+           [0.25..., 0.41...]])
+    """
+    emp_cov = check_array(emp_cov, allow_nd=True)
+    n_features = emp_cov.shape[-1]
+
+    # One mu per matrix (trace over the last two axes), with trailing axes
+    # appended so that it broadcasts against the (n_features, n_features)
+    # identity.
+    mu = np.trace(emp_cov, axis1=-2, axis2=-1) / n_features
+    appended_axes = tuple(range(mu.ndim, emp_cov.ndim))
+    mu = np.expand_dims(mu, axis=appended_axes)
+
+    shrunk_cov = (1.0 - shrinkage) * emp_cov
+    shrunk_cov += shrinkage * mu * np.eye(n_features)
+
+    return shrunk_cov
+
+
+class ShrunkCovariance(EmpiricalCovariance):
+    """Covariance estimator with a fixed shrinkage coefficient.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Specify if the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, data will not be centered before computation.
+        Useful when working with data whose mean is almost, but not exactly
+        zero.
+        If False, data will be centered before computation.
+
+    shrinkage : float, default=0.1
+        Coefficient in the convex combination used for the computation
+        of the shrunk estimate. Range is [0, 1].
+
+    Attributes
+    ----------
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated covariance matrix
+
+    location_ : ndarray of shape (n_features,)
+        Estimated location, i.e. the estimated mean.
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo inverse matrix.
+        (stored only if store_precision is True)
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    EllipticEnvelope : An object for detecting outliers in
+        a Gaussian distributed dataset.
+    EmpiricalCovariance : Maximum likelihood covariance estimator.
+    GraphicalLasso : Sparse inverse covariance estimation
+        with an l1-penalized estimator.
+    GraphicalLassoCV : Sparse inverse covariance with cross-validated
+        choice of the l1 penalty.
+    LedoitWolf : LedoitWolf Estimator.
+    MinCovDet : Minimum Covariance Determinant
+        (robust estimator of covariance).
+    OAS : Oracle Approximating Shrinkage Estimator.
+
+    Notes
+    -----
+    The regularized covariance is given by:
+
+    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+    where mu = trace(cov) / n_features
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import ShrunkCovariance
+    >>> from sklearn.datasets import make_gaussian_quantiles
+    >>> real_cov = np.array([[.8, .3],
+    ...                      [.3, .4]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0],
+    ...                                   cov=real_cov,
+    ...                                   size=500)
+    >>> cov = ShrunkCovariance().fit(X)
+    >>> cov.covariance_
+    array([[0.7387..., 0.2536...],
+           [0.2536..., 0.4110...]])
+    >>> cov.location_
+    array([0.0622..., 0.0193...])
+    """
+
+    _parameter_constraints: dict = {
+        **EmpiricalCovariance._parameter_constraints,
+        "shrinkage": [Interval(Real, 0, 1, closed="both")],
+    }
+
+    def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1):
+        # The parent stores store_precision and assume_centered; only the
+        # shrinkage coefficient is specific to this estimator.
+        super().__init__(
+            store_precision=store_precision, assume_centered=assume_centered
+        )
+        self.shrinkage = shrinkage
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the shrunk covariance model to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        X = self._validate_data(X)
+
+        # The parent's fit is deliberately not used, to avoid a potential
+        # matrix inversion when setting the precision: location and
+        # covariance are set here instead.
+        self.location_ = np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
+
+        emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)
+        self._set_covariance(shrunk_covariance(emp_cov, self.shrinkage))
+
+        return self
+
+
+# Ledoit-Wolf estimator
+
+
+@validate_params(
+    {
+        "X": ["array-like"],
+        "assume_centered": ["boolean"],
+        "block_size": [Interval(Integral, 1, None, closed="left")],
+    },
+    prefer_skip_nested_validation=True,
+)
+def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):
+    """Estimate the Ledoit-Wolf shrinkage coefficient.
+
+    Only the shrinkage coefficient is returned; the shrunk covariance
+    matrix itself is not computed by this function.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage.
+
+    assume_centered : bool, default=False
+        If True, data will not be centered before computation.
+        Useful to work with data whose mean is almost, but not exactly,
+        zero.
+        If False, data will be centered before computation.
+
+    block_size : int, default=1000
+        Size of blocks into which the covariance matrix will be split.
+
+    Returns
+    -------
+    shrinkage : float
+        Coefficient in the convex combination used for the computation
+        of the shrunk estimate.
+
+    Notes
+    -----
+    The regularized (shrunk) covariance is:
+
+    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+    where mu = trace(cov) / n_features
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import ledoit_wolf_shrinkage
+    >>> real_cov = np.array([[.4, .2], [.2, .8]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
+    >>> shrinkage_coefficient = ledoit_wolf_shrinkage(X)
+    >>> shrinkage_coefficient
+    0.23...
+    """
+    X = check_array(X)
+    # for only one feature, the result is the same whatever the shrinkage
+    if len(X.shape) == 2 and X.shape[1] == 1:
+        return 0.0
+    # NOTE(review): presumably never taken if check_array already rejects or
+    # reshapes 1D input -- confirm against check_array
+    if X.ndim == 1:
+        X = np.reshape(X, (1, -1))
+
+    if X.shape[0] == 1:
+        warnings.warn(
+            "Only one sample available. You may want to reshape your data array"
+        )
+    n_samples, n_features = X.shape
+
+    # optionally center data
+    if not assume_centered:
+        X = X - X.mean(0)
+
+    # A non-blocked version of the computation is present in the tests
+    # in tests/test_covariance.py
+
+    # number of complete blocks of `block_size` features; the features past
+    # `block_size * n_splits` form a smaller remainder block handled apart
+    n_splits = int(n_features / block_size)
+    X2 = X**2
+    # per-feature second moments, i.e. the diagonal of X.T @ X / n_samples
+    emp_cov_trace = np.sum(X2, axis=0) / n_samples
+    mu = np.sum(emp_cov_trace) / n_features
+    beta_ = 0.0  # sum of the coefficients of <X2.T, X2>
+    delta_ = 0.0  # sum of the *squared* coefficients of <X.T, X>
+    # starting block computation
+    for i in range(n_splits):
+        # complete row block i against every complete column block j
+        for j in range(n_splits):
+            rows = slice(block_size * i, block_size * (i + 1))
+            cols = slice(block_size * j, block_size * (j + 1))
+            beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols]))
+            delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2)
+        # complete row block i against the remainder columns
+        rows = slice(block_size * i, block_size * (i + 1))
+        beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits :]))
+        delta_ += np.sum(np.dot(X.T[rows], X[:, block_size * n_splits :]) ** 2)
+    # remainder rows against every complete column block j
+    for j in range(n_splits):
+        cols = slice(block_size * j, block_size * (j + 1))
+        beta_ += np.sum(np.dot(X2.T[block_size * n_splits :], X2[:, cols]))
+        delta_ += np.sum(np.dot(X.T[block_size * n_splits :], X[:, cols]) ** 2)
+    # remainder rows against remainder columns
+    delta_ += np.sum(
+        np.dot(X.T[block_size * n_splits :], X[:, block_size * n_splits :]) ** 2
+    )
+    delta_ /= n_samples**2
+    beta_ += np.sum(
+        np.dot(X2.T[block_size * n_splits :], X2[:, block_size * n_splits :])
+    )
+    # use delta_ to compute beta
+    beta = 1.0 / (n_features * n_samples) * (beta_ / n_samples - delta_)
+    # delta is the sum of the squared coefficients of (<X.T,X> - mu*Id) / p
+    delta = delta_ - 2.0 * mu * emp_cov_trace.sum() + n_features * mu**2
+    delta /= n_features
+    # get final beta as the min between beta and delta
+    # We do this to prevent shrinking more than "1", which would invert
+    # the value of covariances
+    beta = min(beta, delta)
+    # finally get shrinkage
+    # (a zero beta short-circuits to 0 so that beta / delta is not evaluated
+    # in that case)
+    shrinkage = 0 if beta == 0 else beta / delta
+    return shrinkage
+
+
+@validate_params(
+    {"X": ["array-like"]},
+    prefer_skip_nested_validation=False,
+)
+def ledoit_wolf(X, *, assume_centered=False, block_size=1000):
+    """Estimate the shrunk Ledoit-Wolf covariance matrix.
+
+    This is a functional wrapper: it fits a :class:`LedoitWolf` estimator
+    (without storing the precision) and returns its results.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Data from which to compute the covariance estimate.
+
+    assume_centered : bool, default=False
+        If True, data will not be centered before computation.
+        Useful to work with data whose mean is significantly equal to
+        zero but is not exactly zero.
+        If False, data will be centered before computation.
+
+    block_size : int, default=1000
+        Size of blocks into which the covariance matrix will be split.
+        This is purely a memory optimization and does not affect results.
+
+    Returns
+    -------
+    shrunk_cov : ndarray of shape (n_features, n_features)
+        Shrunk covariance.
+
+    shrinkage : float
+        Coefficient in the convex combination used for the computation
+        of the shrunk estimate.
+
+    Notes
+    -----
+    The regularized (shrunk) covariance is:
+
+    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+    where mu = trace(cov) / n_features
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import empirical_covariance, ledoit_wolf
+    >>> real_cov = np.array([[.4, .2], [.2, .8]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
+    >>> covariance, shrinkage = ledoit_wolf(X)
+    >>> covariance
+    array([[0.44..., 0.16...],
+           [0.16..., 0.80...]])
+    >>> shrinkage
+    0.23...
+    """
+    # The precision is not part of the return value, so it is not stored.
+    model = LedoitWolf(
+        store_precision=False,
+        assume_centered=assume_centered,
+        block_size=block_size,
+    )
+    fitted = model.fit(X)
+
+    return fitted.covariance_, fitted.shrinkage_
+
+
+class LedoitWolf(EmpiricalCovariance):
+    """LedoitWolf Estimator.
+
+    Ledoit-Wolf is a specific kind of shrinkage in which the shrinkage
+    coefficient is computed with O. Ledoit and M. Wolf's formula, as
+    described in "A Well-Conditioned Estimator for Large-Dimensional
+    Covariance Matrices", Ledoit and Wolf, Journal of Multivariate
+    Analysis, Volume 88, Issue 2, February 2004, pages 365-411.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Whether the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, the data are not centered before computation. This is
+        useful for data whose mean is almost, but not exactly, zero.
+        If False (default), the data are centered before computation.
+
+    block_size : int, default=1000
+        Size of the blocks into which the covariance matrix is split
+        during its Ledoit-Wolf estimation. This is purely a memory
+        optimization and does not affect results.
+
+    Attributes
+    ----------
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated covariance matrix.
+
+    location_ : ndarray of shape (n_features,)
+        Estimated location, i.e. the estimated mean.
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo inverse matrix.
+        (stored only if store_precision is True)
+
+    shrinkage_ : float
+        Coefficient of the convex combination used to compute the shrunk
+        estimate. Range is [0, 1].
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    EllipticEnvelope : An object for detecting outliers in
+        a Gaussian distributed dataset.
+    EmpiricalCovariance : Maximum likelihood covariance estimator.
+    GraphicalLasso : Sparse inverse covariance estimation
+        with an l1-penalized estimator.
+    GraphicalLassoCV : Sparse inverse covariance with cross-validated
+        choice of the l1 penalty.
+    MinCovDet : Minimum Covariance Determinant
+        (robust estimator of covariance).
+    OAS : Oracle Approximating Shrinkage Estimator.
+    ShrunkCovariance : Covariance estimator with shrinkage.
+
+    Notes
+    -----
+    The regularised covariance is:
+
+    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
+
+    where mu = trace(cov) / n_features
+    and shrinkage is given by the Ledoit and Wolf formula (see References)
+
+    References
+    ----------
+    "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices",
+    Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
+    February 2004, pages 365-411.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import LedoitWolf
+    >>> real_cov = np.array([[.4, .2],
+    ...                      [.2, .8]])
+    >>> np.random.seed(0)
+    >>> X = np.random.multivariate_normal(mean=[0, 0],
+    ...                                   cov=real_cov,
+    ...                                   size=50)
+    >>> cov = LedoitWolf().fit(X)
+    >>> cov.covariance_
+    array([[0.4406..., 0.1616...],
+           [0.1616..., 0.8022...]])
+    >>> cov.location_
+    array([ 0.0595... , -0.0075...])
+    """
+
+    # Parent constraints plus `block_size`, which must be an integer >= 1.
+    _parameter_constraints: dict = {
+        **EmpiricalCovariance._parameter_constraints,
+        "block_size": [Interval(Integral, 1, None, closed="left")],
+    }
+
+    def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000):
+        super().__init__(
+            store_precision=store_precision, assume_centered=assume_centered
+        )
+        self.block_size = block_size
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the Ledoit-Wolf shrunk covariance model to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, with `n_samples` samples described by
+            `n_features` features.
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            The fitted instance.
+        """
+        X = self._validate_data(X)
+        # The parent's fit is deliberately bypassed so that the covariance
+        # matrix (and potentially the precision) is not computed there.
+        self.location_ = np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
+        # The data are centered here, hence `assume_centered=True` below.
+        shrunk_cov, shrinkage_coef = _ledoit_wolf(
+            X - self.location_, assume_centered=True, block_size=self.block_size
+        )
+        self.shrinkage_ = shrinkage_coef
+        self._set_covariance(shrunk_cov)
+
+        return self
+
+
+# OAS estimator
+@validate_params(
+    {"X": ["array-like"]},
+    prefer_skip_nested_validation=False,
+)
+def oas(X, *, assume_centered=False):
+    """Estimate covariance with the Oracle Approximating Shrinkage as proposed in [1]_.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Data from which to compute the covariance estimate.
+
+    assume_centered : bool, default=False
+        If True, data will not be centered before computation.
+        Useful when working with data whose mean is almost, but not
+        exactly zero.
+        If False, data will be centered before computation.
+
+    Returns
+    -------
+    shrunk_cov : ndarray of shape (n_features, n_features)
+        Shrunk covariance.
+
+    shrinkage : float
+        Coefficient in the convex combination used for the computation
+        of the shrunk estimate.
+
+    Notes
+    -----
+    The regularised covariance is:
+
+    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features),
+
+    where mu = trace(cov) / n_features and shrinkage is given by the OAS formula
+    (see [1]_).
+
+    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
+    the original article, formula (23) states that 2/p (p being the number of
+    features) is multiplied by Trace(cov*cov) in both the numerator and
+    denominator, but this operation is omitted because for a large p, the value
+    of 2/p is so small that it doesn't affect the value of the estimator.
+
+    References
+    ----------
+    .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.",
+           Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
+           IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
+           <0907.4698>`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import oas
+    >>> rng = np.random.RandomState(0)
+    >>> real_cov = [[.8, .3], [.3, .4]]
+    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
+    >>> shrunk_cov, shrinkage = oas(X)
+    >>> shrunk_cov
+    array([[0.7533..., 0.2763...],
+           [0.2763..., 0.3964...]])
+    >>> shrinkage
+    0.0195...
+    """
+    # Delegate to the estimator class and expose its fitted attributes.
+    fitted = OAS(assume_centered=assume_centered).fit(X)
+    return fitted.covariance_, fitted.shrinkage_
+
+
+class OAS(EmpiricalCovariance):
+    """Oracle Approximating Shrinkage Estimator as proposed in [1]_.
+
+    Read more in the :ref:`User Guide <shrunk_covariance>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Whether the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, the data are not centered before computation. This is
+        useful for data whose mean is almost, but not exactly, zero.
+        If False (default), the data are centered before computation.
+
+    Attributes
+    ----------
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated covariance matrix.
+
+    location_ : ndarray of shape (n_features,)
+        Estimated location, i.e. the estimated mean.
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo inverse matrix.
+        (stored only if store_precision is True)
+
+    shrinkage_ : float
+        Coefficient of the convex combination used to compute the shrunk
+        estimate. Range is [0, 1].
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    EllipticEnvelope : An object for detecting outliers in
+        a Gaussian distributed dataset.
+    EmpiricalCovariance : Maximum likelihood covariance estimator.
+    GraphicalLasso : Sparse inverse covariance estimation
+        with an l1-penalized estimator.
+    GraphicalLassoCV : Sparse inverse covariance with cross-validated
+        choice of the l1 penalty.
+    LedoitWolf : LedoitWolf Estimator.
+    MinCovDet : Minimum Covariance Determinant
+        (robust estimator of covariance).
+    ShrunkCovariance : Covariance estimator with shrinkage.
+
+    Notes
+    -----
+    The regularised covariance is:
+
+    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features),
+
+    where mu = trace(cov) / n_features and shrinkage is given by the OAS formula
+    (see [1]_).
+
+    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
+    the original article, formula (23) states that 2/p (p being the number of
+    features) is multiplied by Trace(cov*cov) in both the numerator and
+    denominator, but this operation is omitted because for a large p, the value
+    of 2/p is so small that it doesn't affect the value of the estimator.
+
+    References
+    ----------
+    .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.",
+           Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
+           IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
+           <0907.4698>`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import OAS
+    >>> from sklearn.datasets import make_gaussian_quantiles
+    >>> real_cov = np.array([[.8, .3],
+    ...                      [.3, .4]])
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.multivariate_normal(mean=[0, 0],
+    ...                             cov=real_cov,
+    ...                             size=500)
+    >>> oas = OAS().fit(X)
+    >>> oas.covariance_
+    array([[0.7533..., 0.2763...],
+           [0.2763..., 0.3964...]])
+    >>> oas.precision_
+    array([[ 1.7833..., -1.2431... ],
+           [-1.2431...,  3.3889...]])
+    >>> oas.shrinkage_
+    0.0195...
+    """
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the Oracle Approximating Shrinkage covariance model to X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, with `n_samples` samples described by
+            `n_features` features.
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            The fitted instance.
+        """
+        X = self._validate_data(X)
+        # The parent's fit is deliberately bypassed so that the covariance
+        # matrix (and potentially the precision) is not computed there.
+        self.location_ = np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
+
+        # The data are centered here, hence `assume_centered=True` below.
+        shrunk_cov, shrinkage_coef = _oas(X - self.location_, assume_centered=True)
+        self.shrinkage_ = shrinkage_coef
+        self._set_covariance(shrunk_cov)
+
+        return self
@@ -0,0 +1,377 @@
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Gael Varoquaux <gael.varoquaux@normalesup.org>
+#         Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+import numpy as np
+import pytest
+
+from sklearn import datasets
+from sklearn.covariance import (
+    OAS,
+    EmpiricalCovariance,
+    LedoitWolf,
+    ShrunkCovariance,
+    empirical_covariance,
+    ledoit_wolf,
+    ledoit_wolf_shrinkage,
+    oas,
+    shrunk_covariance,
+)
+from sklearn.covariance._shrunk_covariance import _ledoit_wolf
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
+
+from .._shrunk_covariance import _oas
+
+# Module-level test data shared by the tests below: the feature matrix
+# returned by `load_diabetes` (the target is discarded).
+X, _ = datasets.load_diabetes(return_X_y=True)
+# First column of X as a 1-D vector.
+X_1d = X[:, 0]
+# Dimensions of the shared data matrix.
+n_samples, n_features = X.shape
+
+
+def test_covariance():
+    """Check `EmpiricalCovariance` against `empirical_covariance` on simple data."""
+    # Estimator and function must agree on the full dataset.
+    estimator = EmpiricalCovariance()
+    estimator.fit(X)
+    emp_cov = empirical_covariance(X)
+    assert_array_almost_equal(emp_cov, estimator.covariance_, 4)
+    # The error norm w.r.t. the same covariance is zero for each option.
+    error_norm_options = (
+        {},
+        {"norm": "spectral"},
+        {"norm": "frobenius"},
+        {"scaling": False},
+        {"squared": False},
+    )
+    for options in error_norm_options:
+        assert_almost_equal(estimator.error_norm(emp_cov, **options), 0)
+    # An unknown norm is rejected.
+    with pytest.raises(NotImplementedError):
+        estimator.error_norm(emp_cov, norm="foo")
+    # Mahalanobis distances computation test
+    distances = estimator.mahalanobis(X)
+    assert np.amin(distances) > 0
+
+    # Single-feature data (n_features = 1).
+    X_1d = X[:, 0].reshape((-1, 1))
+    estimator = EmpiricalCovariance()
+    estimator.fit(X_1d)
+    assert_array_almost_equal(empirical_covariance(X_1d), estimator.covariance_, 4)
+    assert_almost_equal(estimator.error_norm(empirical_covariance(X_1d)), 0)
+    assert_almost_equal(
+        estimator.error_norm(empirical_covariance(X_1d), norm="spectral"), 0
+    )
+
+    # A single sample (1 sample, 5 features) must trigger a warning and
+    # yield an all-zero covariance.
+    X_1sample = np.arange(5).reshape(1, 5)
+    estimator = EmpiricalCovariance()
+    warn_msg = "Only one sample available. You may want to reshape your data array"
+    with pytest.warns(UserWarning, match=warn_msg):
+        estimator.fit(X_1sample)
+
+    expected_zero = np.zeros(shape=(5, 5), dtype=np.float64)
+    assert_array_almost_equal(estimator.covariance_, expected_zero)
+
+    # Integer input.
+    X_integer = np.asarray([[0, 1], [1, 0]])
+    expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
+    assert_array_almost_equal(empirical_covariance(X_integer), expected)
+
+    # With assume_centered=True the location is exactly zero.
+    estimator = EmpiricalCovariance(assume_centered=True)
+    estimator.fit(X)
+    assert_array_equal(estimator.location_, np.zeros(X.shape[1]))
+
+
+@pytest.mark.parametrize("n_matrices", [1, 3])
+def test_shrunk_covariance_func(n_matrices):
+    """Check `shrunk_covariance` function."""
+
+    n_features = 2
+    input_cov = np.ones((n_features, n_features))
+    expected = np.array([[1, 0.5], [0.5, 1]])
+
+    # For the stacked case, repeat input and target along a new leading axis.
+    if n_matrices > 1:
+        input_cov, expected = [
+            np.repeat(matrix[np.newaxis, ...], n_matrices, axis=0)
+            for matrix in (input_cov, expected)
+        ]
+
+    assert_allclose(shrunk_covariance(input_cov, 0.5), expected)
+
+
+def test_shrunk_covariance():
+    """Check consistency between `ShrunkCovariance` and `shrunk_covariance`."""
+
+    # Explicit shrinkage: the estimator must match shrinking the MLE estimate.
+    estimator = ShrunkCovariance(shrinkage=0.5)
+    estimator.fit(X)
+    from_function = shrunk_covariance(empirical_covariance(X), shrinkage=0.5)
+    assert_array_almost_equal(from_function, estimator.covariance_, 4)
+
+    # Same comparison, relying on the default shrinkage on both sides.
+    estimator = ShrunkCovariance()
+    estimator.fit(X)
+    from_function = shrunk_covariance(empirical_covariance(X))
+    assert_array_almost_equal(from_function, estimator.covariance_, 4)
+
+    # shrinkage = 0 is equivalent to the empirical covariance.
+    estimator = ShrunkCovariance(shrinkage=0.0)
+    estimator.fit(X)
+    assert_array_almost_equal(empirical_covariance(X), estimator.covariance_, 4)
+
+    # Single-feature data (n_features = 1).
+    X_1d = X[:, 0].reshape((-1, 1))
+    estimator = ShrunkCovariance(shrinkage=0.3)
+    estimator.fit(X_1d)
+    assert_array_almost_equal(empirical_covariance(X_1d), estimator.covariance_, 4)
+
+    # With store_precision=False no precision is kept.
+    estimator = ShrunkCovariance(shrinkage=0.5, store_precision=False)
+    estimator.fit(X)
+    assert estimator.precision_ is None
+
+
+def test_ledoit_wolf():
+    """Check `LedoitWolf` against the functional API and `ShrunkCovariance`."""
+    # ---- Data centered beforehand, estimator told not to center -----------
+    X_centered = X - X.mean(axis=0)
+    model = LedoitWolf(assume_centered=True).fit(X_centered)
+    ref_shrinkage = model.shrinkage_
+    ref_score = model.score(X_centered)
+
+    # The standalone shrinkage function agrees, with the default block size
+    # and with a small one.
+    assert_almost_equal(
+        ledoit_wolf_shrinkage(X_centered, assume_centered=True), ref_shrinkage
+    )
+    assert_almost_equal(
+        ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6),
+        ref_shrinkage,
+    )
+    # The functional API returns the same covariance and shrinkage.
+    func_cov, func_shrinkage = ledoit_wolf(X_centered, assume_centered=True)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    # ShrunkCovariance given the same coefficient yields the same covariance.
+    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_, assume_centered=True)
+    shrunk.fit(X_centered)
+    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)
+
+    # Single-feature data (n_features = 1).
+    X_1d = X[:, 0].reshape((-1, 1))
+    model = LedoitWolf(assume_centered=True).fit(X_1d)
+    func_cov, func_shrinkage = ledoit_wolf(X_1d, assume_centered=True)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    assert_array_almost_equal((X_1d**2).sum() / n_samples, model.covariance_, 4)
+
+    # Without storing the precision: same score, no precision attribute value.
+    model = LedoitWolf(store_precision=False, assume_centered=True).fit(X_centered)
+    assert_almost_equal(model.score(X_centered), ref_score, 4)
+    assert model.precision_ is None
+
+    # ---- Same checks, letting the estimator center the data ---------------
+    model = LedoitWolf().fit(X)
+    assert_almost_equal(model.shrinkage_, ref_shrinkage, 4)
+    assert_almost_equal(model.shrinkage_, ledoit_wolf_shrinkage(X))
+    assert_almost_equal(model.shrinkage_, ledoit_wolf(X)[1])
+    private_result = _ledoit_wolf(X=X, assume_centered=False, block_size=10000)
+    assert_almost_equal(model.shrinkage_, private_result[1])
+    assert_almost_equal(model.score(X), ref_score, 4)
+    # The functional API returns the same covariance and shrinkage.
+    func_cov, func_shrinkage = ledoit_wolf(X)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    # ShrunkCovariance given the same coefficient yields the same covariance.
+    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_)
+    shrunk.fit(X)
+    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)
+
+    # Single-feature data (n_features = 1).
+    X_1d = X[:, 0].reshape((-1, 1))
+    model = LedoitWolf().fit(X_1d)
+    private_result = _ledoit_wolf(X=X_1d, assume_centered=False, block_size=10000)
+    assert_allclose(X_1d.var(ddof=0), private_result[0])
+    func_cov, func_shrinkage = ledoit_wolf(X_1d)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    assert_array_almost_equal(empirical_covariance(X_1d), model.covariance_, 4)
+
+    # A single sample must trigger a warning and yield a zero covariance.
+    X_1sample = np.arange(5).reshape(1, 5)
+    model = LedoitWolf()
+
+    warn_msg = "Only one sample available. You may want to reshape your data array"
+    with pytest.warns(UserWarning, match=warn_msg):
+        model.fit(X_1sample)
+
+    expected_zero = np.zeros(shape=(5, 5), dtype=np.float64)
+    assert_array_almost_equal(model.covariance_, expected_zero)
+
+    # Without storing the precision: same score, no precision attribute value.
+    model = LedoitWolf(store_precision=False).fit(X)
+    assert_almost_equal(model.score(X), ref_score, 4)
+    assert model.precision_ is None
+
+
+def _naive_ledoit_wolf_shrinkage(X):
+    """Compute the Ledoit-Wolf shrinkage with a direct, unblocked implementation.
+
+    Follows "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for
+    Large-Dimensional Covariance Matrices"; beta and delta are given in the
+    beginning of section 3.2.
+    """
+    n_samples, n_features = X.shape
+    emp_cov = empirical_covariance(X, assume_centered=False)
+
+    # delta: scaled squared distance between emp_cov and mu * identity.
+    mu = np.trace(emp_cov) / n_features
+    deviation = emp_cov - mu * np.identity(n_features)
+    delta = (deviation**2).sum() / n_features
+
+    # beta, before it is capped by delta.
+    squared = X**2
+    second_moment = np.dot(squared.T, squared) / n_samples
+    beta_uncapped = (
+        1.0 / (n_features * n_samples) * np.sum(second_moment - emp_cov**2)
+    )
+
+    # Capping beta at delta keeps the ratio at most 1.
+    return min(beta_uncapped, delta) / delta
+
+
+def test_ledoit_wolf_small():
+    """Check the blocked implementation against the naive one on 4 features."""
+    X_small = X[:, :4]
+    fitted = LedoitWolf().fit(X_small)
+
+    assert_almost_equal(fitted.shrinkage_, _naive_ledoit_wolf_shrinkage(X_small))
+
+
+def test_ledoit_wolf_large():
+    """Check that LedoitWolf handles data wider than `block_size`."""
+    # 20 features, i.e. more than the block size of 10 used first.
+    X_wide = np.random.RandomState(0).normal(size=(10, 20))
+    blocked_cov = LedoitWolf(block_size=10).fit(X_wide).covariance_
+    # check that covariance is about diagonal (random normal noise)
+    assert_almost_equal(blocked_cov, np.eye(20), 0)
+
+    # A block size above n_features (no splitting) must give the same result.
+    unblocked_cov = LedoitWolf(block_size=25).fit(X_wide).covariance_
+    assert_almost_equal(unblocked_cov, blocked_cov)
+
+
+@pytest.mark.parametrize(
+    "ledoit_wolf_fitting_function", [LedoitWolf().fit, ledoit_wolf_shrinkage]
+)
+def test_ledoit_wolf_empty_array(ledoit_wolf_fitting_function):
+    """Check that we validate X and raise proper error with 0-sample array."""
+    # Zero samples, two features.
+    no_samples = np.zeros((0, 2))
+    with pytest.raises(ValueError, match="Found array with 0 sample"):
+        ledoit_wolf_fitting_function(no_samples)
+
+
+def test_oas():
+    """Check `OAS` against the functional API and `ShrunkCovariance`."""
+    # ---- Data centered beforehand, estimator told not to center -----------
+    X_centered = X - X.mean(axis=0)
+    model = OAS(assume_centered=True).fit(X_centered)
+    ref_shrinkage = model.shrinkage_
+    ref_score = model.score(X_centered)
+    # The functional API returns the same covariance and shrinkage.
+    func_cov, func_shrinkage = oas(X_centered, assume_centered=True)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    # ShrunkCovariance given the same coefficient yields the same covariance.
+    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_, assume_centered=True)
+    shrunk.fit(X_centered)
+    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)
+
+    # Single-feature data (n_features = 1).
+    X_1d = X[:, 0:1]
+    model = OAS(assume_centered=True).fit(X_1d)
+    func_cov, func_shrinkage = oas(X_1d, assume_centered=True)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    assert_array_almost_equal((X_1d**2).sum() / n_samples, model.covariance_, 4)
+
+    # Without storing the precision: same score, no precision attribute value.
+    model = OAS(store_precision=False, assume_centered=True).fit(X_centered)
+    assert_almost_equal(model.score(X_centered), ref_score, 4)
+    assert model.precision_ is None
+
+    # ---- Same checks, letting the estimator center the data ---------------
+    model = OAS().fit(X)
+    assert_almost_equal(model.shrinkage_, ref_shrinkage, 4)
+    assert_almost_equal(model.score(X), ref_score, 4)
+    # The functional API returns the same covariance and shrinkage.
+    func_cov, func_shrinkage = oas(X)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    # ShrunkCovariance given the same coefficient yields the same covariance.
+    shrunk = ShrunkCovariance(shrinkage=model.shrinkage_)
+    shrunk.fit(X)
+    assert_array_almost_equal(shrunk.covariance_, model.covariance_, 4)
+
+    # Single-feature data (n_features = 1).
+    X_1d = X[:, 0].reshape((-1, 1))
+    model = OAS().fit(X_1d)
+    func_cov, func_shrinkage = oas(X_1d)
+    assert_array_almost_equal(func_cov, model.covariance_, 4)
+    assert_almost_equal(func_shrinkage, model.shrinkage_)
+    assert_array_almost_equal(empirical_covariance(X_1d), model.covariance_, 4)
+
+    # A single sample must trigger a warning and yield a zero covariance.
+    X_1sample = np.arange(5).reshape(1, 5)
+    model = OAS()
+    warn_msg = "Only one sample available. You may want to reshape your data array"
+    with pytest.warns(UserWarning, match=warn_msg):
+        model.fit(X_1sample)
+
+    expected_zero = np.zeros(shape=(5, 5), dtype=np.float64)
+    assert_array_almost_equal(model.covariance_, expected_zero)
+
+    # Without storing the precision: same score, no precision attribute value.
+    model = OAS(store_precision=False).fit(X)
+    assert_almost_equal(model.score(X), ref_score, 4)
+    assert model.precision_ is None
+
+    # Private helper `_oas`, called with its default arguments on one feature.
+    X_1f = X[:, 0:1]
+    model = OAS().fit(X_1f)
+    # It must agree with the estimator on both covariance and shrinkage.
+    private_cov, private_shrinkage = _oas(X_1f)
+    assert_array_almost_equal(private_cov, model.covariance_, 4)
+    assert_almost_equal(private_shrinkage, model.shrinkage_)
+    assert_array_almost_equal((X_1f**2).sum() / n_samples, model.covariance_, 4)
+
+
+def test_EmpiricalCovariance_validates_mahalanobis():
+    """Checks that EmpiricalCovariance validates data with mahalanobis."""
+    fitted = EmpiricalCovariance().fit(X)
+    # Only the first two columns are passed, fewer than seen during fit.
+    too_narrow = X[:, :2]
+
+    msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input"
+    with pytest.raises(ValueError, match=msg):
+        fitted.mahalanobis(too_narrow)
@@ -0,0 +1,52 @@
+"""
+Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope).
+"""
+
+import numpy as np
+import pytest
+
+from sklearn.covariance import EllipticEnvelope
+from sklearn.exceptions import NotFittedError
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
+
+
+def test_elliptic_envelope(global_random_seed):
+    """Check EllipticEnvelope predictions, scores and decisions for consistency."""
+    rng = np.random.RandomState(global_random_seed)
+    X = rng.randn(100, 10)
+    clf = EllipticEnvelope(contamination=0.1)
+    # Both prediction entry points must refuse to run before fitting.
+    for unfitted_method in (clf.predict, clf.decision_function):
+        with pytest.raises(NotFittedError):
+            unfitted_method(X)
+    clf.fit(X)
+    y_pred = clf.predict(X)
+    scores = clf.score_samples(X)
+    decisions = clf.decision_function(X)
+
+    # Scores are the negated Mahalanobis distances, which equal `dist_`.
+    assert_array_almost_equal(scores, -clf.mahalanobis(X))
+    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
+    # Scoring against all-ones labels gives the share of samples not
+    # predicted as -1.
+    n_flagged = y_pred[y_pred == -1].size
+    assert_almost_equal(clf.score(X, np.ones(100)), (100 - n_flagged) / 100.0)
+    # A -1 prediction goes with a negative decision value.
+    assert sum(y_pred == -1) == sum(decisions < 0)
+
+
+def test_score_samples():
+    """Check that `score_samples` equals `decision_function` plus `offset_`."""
+    X_train = [[1, 1], [1, 2], [2, 1]]
+    query = [[2.0, 2.0]]
+    clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
+    clf2 = EllipticEnvelope().fit(X_train)
+    # The relation must hold for both contamination settings.
+    for clf in (clf1, clf2):
+        assert_array_equal(
+            clf.score_samples(query),
+            clf.decision_function(query) + clf.offset_,
+        )
+    # The raw scores are the same for both contamination settings.
+    assert_array_equal(clf1.score_samples(query), clf2.score_samples(query))
@@ -0,0 +1,318 @@
+"""Test the graphical_lasso module."""
+
+import sys
+from io import StringIO
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+from scipy import linalg
+
+from sklearn import datasets
+from sklearn.covariance import (
+    GraphicalLasso,
+    GraphicalLassoCV,
+    empirical_covariance,
+    graphical_lasso,
+)
+from sklearn.datasets import make_sparse_spd_matrix
+from sklearn.model_selection import GroupKFold
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import (
+    _convert_container,
+    assert_array_almost_equal,
+    assert_array_less,
+)
+
+
+def test_graphical_lassos(random_state=1):
+    """Compare the "cd" and "lars" graphical lasso solvers.
+
+    This check is unstable for some random seeds, where the covariances found
+    with the "cd" and "lars" solvers differ (4 cases / 100 tries).
+    """
+    # Draw samples from a multivariate normal with a sparse precision matrix.
+    n_features, n_samples = 20, 100
+    rng = check_random_state(random_state)
+    true_precision = make_sparse_spd_matrix(n_features, alpha=0.95, random_state=rng)
+    true_cov = linalg.inv(true_precision)
+    X = rng.multivariate_normal(np.zeros(n_features), true_cov, size=n_samples)
+    emp_cov = empirical_covariance(X)
+
+    for alpha in (0.0, 0.1, 0.25):
+        covs, icovs = {}, {}
+        for solver in ("cd", "lars"):
+            covs[solver], icovs[solver], history = graphical_lasso(
+                emp_cov, return_costs=True, alpha=alpha, mode=solver
+            )
+            costs, _dual_gaps = np.array(history).T
+            # The cost must never increase, except when alpha == 0 where this
+            # does not hold.
+            if alpha != 0:
+                # compare against 1e-12 since the cost can be exactly 0
+                assert_array_less(np.diff(costs), 1e-12)
+        # Both solvers are expected to reach similar solutions.
+        assert_allclose(covs["cd"], covs["lars"], atol=5e-4)
+        assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4)
+
+    # Smoke test the estimator; `covs` still holds the results of the last
+    # alpha of the loop above (0.25).
+    estimator = GraphicalLasso(alpha=0.25).fit(X)
+    estimator.score(X)
+    for solver in ("cd", "lars"):
+        assert_array_almost_equal(estimator.covariance_, covs[solver], decimal=4)
+
+    # On centered data, `assume_centered` True or False must lead to the same
+    # precision matrix.
+    Z = X - X.mean(0)
+    precisions = [
+        GraphicalLasso(assume_centered=flag).fit(Z).precision_
+        for flag in (False, True)
+    ]
+    assert_array_almost_equal(precisions[0], precisions[1])
+
+
+def test_graphical_lasso_when_alpha_equals_0():
+    """Test graphical_lasso's early return condition when alpha=0.
+
+    With alpha=0, both the `GraphicalLasso` estimator (fed a precomputed
+    covariance) and the `graphical_lasso` function are expected to return the
+    plain inverse of the empirical covariance as precision matrix.
+    """
+    # Draw the data from a locally seeded generator instead of the global
+    # NumPy RNG, so the test is reproducible and independent of the order in
+    # which tests are executed.
+    rng = np.random.RandomState(0)
+    X = rng.randn(100, 10)
+    emp_cov = empirical_covariance(X, assume_centered=True)
+
+    model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov)
+    assert_allclose(model.precision_, np.linalg.inv(emp_cov))
+
+    _, precision = graphical_lasso(emp_cov, alpha=0)
+    assert_allclose(precision, np.linalg.inv(emp_cov))
+
+
+@pytest.mark.parametrize("mode", ["cd", "lars"])
+def test_graphical_lasso_n_iter(mode):
+    """Check that the returned iteration count equals a small `max_iter`."""
+    max_iter = 2
+    X, _ = datasets.make_classification(n_samples=5_000, n_features=20, random_state=0)
+    emp_cov = empirical_covariance(X)
+
+    _cov, _precision, n_iter = graphical_lasso(
+        emp_cov, 0.2, mode=mode, max_iter=max_iter, return_n_iter=True
+    )
+    assert n_iter == 2
+
+
+def test_graphical_lasso_iris():
+    """Compare both solvers on the iris data with a reference solution."""
+    # Reference values hard-coded from the R glasso package for alpha=1.0
+    # (penalize.diagonal has to be set to FALSE).
+    expected_cov = np.array(
+        [
+            [0.68112222, 0.0000000, 0.265820, 0.02464314],
+            [0.00000000, 0.1887129, 0.000000, 0.00000000],
+            [0.26582000, 0.0000000, 3.095503, 0.28697200],
+            [0.02464314, 0.0000000, 0.286972, 0.57713289],
+        ]
+    )
+    expected_precision = np.array(
+        [
+            [1.5190747, 0.000000, -0.1304475, 0.0000000],
+            [0.0000000, 5.299055, 0.0000000, 0.0000000],
+            [-0.1304475, 0.000000, 0.3498624, -0.1683946],
+            [0.0000000, 0.000000, -0.1683946, 1.8164353],
+        ]
+    )
+    iris_cov = empirical_covariance(datasets.load_iris().data)
+    for solver in ("cd", "lars"):
+        est_cov, est_precision = graphical_lasso(
+            iris_cov, alpha=1.0, return_costs=False, mode=solver
+        )
+        assert_array_almost_equal(est_cov, expected_cov)
+        assert_array_almost_equal(est_precision, expected_precision)
+
+
+def test_graph_lasso_2D():
+    """Compare both solvers on two iris features with a reference solution."""
+    # Reference values hard-coded from the Python skggm package, obtained by
+    # calling `quic(emp_cov, lam=.1, tol=1e-8)`.
+    expected_cov = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]])
+    expected_precision = np.array(
+        [[1.52836773, -3.14334831], [-3.14334831, 8.19753385]]
+    )
+
+    # Only the last two columns of the iris data are used.
+    last_two_features = datasets.load_iris().data[:, 2:]
+    emp_cov = empirical_covariance(last_two_features)
+    for solver in ("cd", "lars"):
+        est_cov, est_precision = graphical_lasso(
+            emp_cov, alpha=0.1, return_costs=False, mode=solver
+        )
+        assert_array_almost_equal(est_cov, expected_cov)
+        assert_array_almost_equal(est_precision, expected_precision)
+
+
+def test_graphical_lasso_iris_singular():
+    """Compare both solvers with a reference solution in a rank-deficient case."""
+    # A small subset of rows gives the rank-deficient case; the samples are
+    # chosen such that none of the variances are zero.
+    row_subset = np.arange(10, 13)
+
+    # Reference values hard-coded from the R glasso package for alpha=0.01.
+    expected_cov = np.array(
+        [
+            [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
+            [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
+            [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
+            [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222],
+        ]
+    )
+    expected_precision = np.array(
+        [
+            [24.42244057, -16.831679593, 0.0, 0.0],
+            [-16.83168201, 24.351841681, -6.206896552, -12.5],
+            [0.0, -6.206896171, 153.103448276, 0.0],
+            [0.0, -12.499999143, 0.0, 462.5],
+        ]
+    )
+    subset_cov = empirical_covariance(datasets.load_iris().data[row_subset, :])
+    for solver in ("cd", "lars"):
+        est_cov, est_precision = graphical_lasso(
+            subset_cov, alpha=0.01, return_costs=False, mode=solver
+        )
+        assert_array_almost_equal(est_cov, expected_cov, decimal=5)
+        assert_array_almost_equal(est_precision, expected_precision, decimal=5)
+
+
+def test_graphical_lasso_cv(random_state=1):
+    """Smoke test `GraphicalLassoCV` in verbose mode on a tiny dataset."""
+    # Draw a few samples from a multivariate normal with sparse precision.
+    n_features, n_samples = 5, 6
+    rng = check_random_state(random_state)
+    true_precision = make_sparse_spd_matrix(n_features, alpha=0.96, random_state=rng)
+    true_cov = linalg.inv(true_precision)
+    X = rng.multivariate_normal(np.zeros(n_features), true_cov, size=n_samples)
+
+    # Swap stdout for an in-memory buffer while fitting, and always restore
+    # it afterwards, so that the verbose output is exercised but not shown.
+    saved_stdout = sys.stdout
+    try:
+        sys.stdout = StringIO()
+        # We need verbose very high so that Parallel prints on stdout
+        estimator = GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1)
+        estimator.fit(X)
+    finally:
+        sys.stdout = saved_stdout
+
+
+@pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"])
+def test_graphical_lasso_cv_alphas_iterable(alphas_container_type):
+    """Check that `alphas` accepts an array-like of values.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/22489
+    """
+    covariance = np.array(
+        [
+            [0.8, 0.0, 0.2, 0.0],
+            [0.0, 0.4, 0.0, 0.0],
+            [0.2, 0.0, 0.3, 0.1],
+            [0.0, 0.0, 0.1, 0.7],
+        ]
+    )
+    X = np.random.RandomState(0).multivariate_normal(
+        mean=[0, 0, 0, 0], cov=covariance, size=200
+    )
+    alphas = _convert_container([0.02, 0.03], alphas_container_type)
+
+    # Fitting without error is the whole check.
+    estimator = GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1)
+    estimator.fit(X)
+
+
+@pytest.mark.parametrize(
+    "alphas,err_type,err_msg",
+    [
+        ([-0.02, 0.03], ValueError, "must be > 0"),
+        ([0, 0.03], ValueError, "must be > 0"),
+        (["not_number", 0.03], TypeError, "must be an instance of float"),
+    ],
+)
+def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg):
+    """Check the errors raised for invalid entries in an array-like `alphas`.
+
+    A value outside of (0, inf] must raise a ValueError, and a string must
+    raise a TypeError.
+    """
+    covariance = np.array(
+        [
+            [0.8, 0.0, 0.2, 0.0],
+            [0.0, 0.4, 0.0, 0.0],
+            [0.2, 0.0, 0.3, 0.1],
+            [0.0, 0.0, 0.1, 0.7],
+        ]
+    )
+    X = np.random.RandomState(0).multivariate_normal(
+        mean=[0, 0, 0, 0], cov=covariance, size=200
+    )
+
+    with pytest.raises(err_type, match=err_msg):
+        GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X)
+
+
+def test_graphical_lasso_cv_scores():
+    """Check the content of `cv_results_` after fitting `GraphicalLassoCV`."""
+    n_splits, n_alphas, n_refinements = 4, 5, 3
+    covariance = np.array(
+        [
+            [0.8, 0.0, 0.2, 0.0],
+            [0.0, 0.4, 0.0, 0.0],
+            [0.2, 0.0, 0.3, 0.1],
+            [0.0, 0.0, 0.1, 0.7],
+        ]
+    )
+    X = np.random.RandomState(0).multivariate_normal(
+        mean=[0, 0, 0, 0], cov=covariance, size=200
+    )
+
+    estimator = GraphicalLassoCV(
+        cv=n_splits, alphas=n_alphas, n_refinements=n_refinements
+    )
+    fitted = estimator.fit(X)
+
+    _assert_graphical_lasso_cv_scores(
+        cov=fitted,
+        n_splits=n_splits,
+        n_refinements=n_refinements,
+        n_alphas=n_alphas,
+    )
+
+
+@pytest.mark.usefixtures("enable_slep006")
+def test_graphical_lasso_cv_scores_with_routing(global_random_seed):
+    """Check that `GraphicalLassoCV` internally dispatches metadata to
+    the splitter.
+    """
+    n_splits, n_alphas, n_refinements = 5, 5, 3
+    covariance = np.array(
+        [
+            [0.8, 0.0, 0.2, 0.0],
+            [0.0, 0.4, 0.0, 0.0],
+            [0.2, 0.0, 0.3, 0.1],
+            [0.0, 0.0, 0.1, 0.7],
+        ]
+    )
+    rng = np.random.RandomState(global_random_seed)
+    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=covariance, size=300)
+    # One random group label in [0, 5) per sample.
+    groups = rng.randint(0, 5, X.shape[0])
+
+    # The splitter explicitly requests `groups`, which is then passed to `fit`.
+    splitter = GroupKFold(n_splits=n_splits)
+    splitter.set_split_request(groups=True)
+
+    estimator = GraphicalLassoCV(
+        cv=splitter, alphas=n_alphas, n_refinements=n_refinements
+    )
+    fitted = estimator.fit(X, groups=groups)
+
+    _assert_graphical_lasso_cv_scores(
+        cov=fitted,
+        n_splits=n_splits,
+        n_refinements=n_refinements,
+        n_alphas=n_alphas,
+    )
+
+
+def _assert_graphical_lasso_cv_scores(cov, n_splits, n_refinements, n_alphas):
+    """Validate the layout and summary statistics of `cov.cv_results_`.
+
+    The "alphas" entry and each "split{i}_test_score" entry (one per split)
+    must be present with `n_refinements * n_alphas + 1` values, and the
+    "mean_test_score" / "std_test_score" entries must match the mean and
+    standard deviation of the per-split scores.
+    """
+    results = cov.cv_results_
+    expected_len = n_refinements * n_alphas + 1
+    split_keys = [f"split{idx}_test_score" for idx in range(n_splits)]
+
+    # The alphas plus one score entry for each split.
+    for name in ["alphas", *split_keys]:
+        assert name in results
+        assert len(results[name]) == expected_len
+
+    # Stack the per-split scores, one row per split, and aggregate over
+    # the splits.
+    per_split_scores = np.asarray([results[name] for name in split_keys])
+    assert_allclose(results["mean_test_score"], per_split_scores.mean(axis=0))
+    assert_allclose(results["std_test_score"], per_split_scores.std(axis=0))
@@ -0,0 +1,171 @@
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Gael Varoquaux <gael.varoquaux@normalesup.org>
+#         Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+import itertools
+
+import numpy as np
+import pytest
+
+from sklearn import datasets
+from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd
+from sklearn.utils._testing import assert_array_almost_equal
+
+# Module-level data: the `data` array of the iris dataset, its first column,
+# and the two entries of its shape unpacked as (n_samples, n_features).
+X = datasets.load_iris().data
+X_1d = X[:, 0]
+n_samples, n_features = X.shape
+
+
+def test_mcd(global_random_seed):
+    """Run the FastMCD checks over a range of dataset sizes and outlier counts."""
+    # Each row is (n_samples, n_features, n_outliers, tol_loc, tol_cov,
+    # tol_support); the scenarios are run in this order.
+    scenarios = [
+        # Small data set without outliers (random independent normal data)
+        (100, 5, 0, 0.02, 0.1, 75),
+        # Small data set, medium contamination
+        (100, 5, 20, 0.3, 0.3, 65),
+        # Small data set, strong contamination
+        (100, 5, 40, 0.1, 0.1, 50),
+        # Medium data set
+        (1000, 5, 450, 0.1, 0.1, 540),
+        # Large data set
+        (1700, 5, 800, 0.1, 0.1, 870),
+        # 1D data set
+        (500, 1, 100, 0.02, 0.02, 350),
+    ]
+    for scenario in scenarios:
+        launch_mcd_on_dataset(*scenario, global_random_seed)
+
+
+def test_fast_mcd_on_invalid_input():
+    """Check that `fast_mcd` rejects a 1D array with a `ValueError`."""
+    one_dimensional = np.arange(100)
+    expected_msg = "Expected 2D array, got 1D array instead"
+    with pytest.raises(ValueError, match=expected_msg):
+        fast_mcd(one_dimensional)
+
+
+def test_mcd_class_on_invalid_input():
+    """Check that `MinCovDet.fit` rejects a 1D array with a `ValueError`."""
+    one_dimensional = np.arange(100)
+    estimator = MinCovDet()
+    expected_msg = "Expected 2D array, got 1D array instead"
+    with pytest.raises(ValueError, match=expected_msg):
+        estimator.fit(one_dimensional)
+
+
+def launch_mcd_on_dataset(
+    n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed
+):
+    """Fit `MinCovDet` on synthetic contaminated data and check its estimates.
+
+    Standard normal data of shape (n_samples, n_features) is generated from
+    `seed`, then `n_outliers` randomly chosen rows are shifted by +/-5 on
+    every feature. The fitted location and covariance are compared, through
+    their mean squared difference, with the estimates computed on the
+    unshifted rows (upper bounds `tol_loc` and `tol_cov`). The support must
+    hold at least `tol_support` samples, and `mahalanobis(data)` must match
+    `dist_`.
+    """
+    rng = np.random.RandomState(seed)
+    data = rng.randn(n_samples, n_features)
+
+    # Contaminate a random subset of the rows.
+    outlier_rows = rng.permutation(n_samples)[:n_outliers]
+    shifts = 10.0 * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
+    data[outlier_rows] += shifts
+
+    is_inlier = np.ones(n_samples).astype(bool)
+    is_inlier[outlier_rows] = False
+    clean_data = data[is_inlier]
+
+    # Compute the MCD by fitting an estimator object.
+    estimator = MinCovDet(random_state=seed).fit(data)
+
+    # Compare with the estimates learnt from the inliers only.
+    location_error = np.mean((clean_data.mean(0) - estimator.location_) ** 2)
+    assert location_error < tol_loc
+    covariance_error = np.mean(
+        (empirical_covariance(clean_data) - estimator.covariance_) ** 2
+    )
+    assert covariance_error < tol_cov
+    assert np.sum(estimator.support_) >= tol_support
+    assert_array_almost_equal(estimator.mahalanobis(data), estimator.dist_)
+
+
+def test_mcd_issue1127():
+    """Check that fitting does not break with X.shape = (3, 1).
+
+    This is the case where n_support = n_samples.
+    """
+    rng = np.random.RandomState(0)
+    tiny_column = rng.normal(size=(3, 1))
+    MinCovDet().fit(tiny_column)
+
+
+def test_mcd_issue3367(global_random_seed):
+    """Check that MCD completes when the covariance matrix is singular.
+
+    The data lie in a plane, i.e. one row and column of the covariance
+    matrix are all zeros.
+    """
+    rng = np.random.RandomState(global_random_seed)
+
+    # 10 evenly spaced values between -5 and 5, used for both X and Y.
+    axis_values = np.linspace(-5, 5, 10).tolist()
+    # Cartesian product of those values: every possible (x, y) pair.
+    xy_points = np.array(list(itertools.product(axis_values, repeat=2)))
+
+    # Append an all-zero third column, so the points lie within a plane and
+    # the covariance matrix is singular.
+    zero_column = np.zeros((xy_points.shape[0], 1))
+    planar_points = np.hstack((xy_points, zero_column))
+
+    # The fit below is the actual check: it has to complete on singular data.
+    # As a further test, since the points are in XYZ, the principal
+    # components (eigenvectors) directly relate to the geometry of the points.
+    # Everything is in the XY plane, so the eigenvector that corresponds to
+    # the smallest eigenvalue should be the plane normal [0, 0, 1]. To check
+    # this, one would start with:
+    #
+    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
+    #     normal = evecs[:, np.argmin(evals)]
+    #
+    # and then assert that `normal` is equal to [0, 0, 1]. Because of floating
+    # point error, it is best to subtract the two and compare the difference
+    # with a small tolerance (e.g. 1e-12).
+    MinCovDet(random_state=rng).fit(planar_points)
+
+
+def test_mcd_support_covariance_is_zero():
+    """Check the error raised when the support data has zero covariance.
+
+    `MinCovDet.fit` must raise a `ValueError` with an informative message.
+    """
+    # Two single-feature datasets, each reshaped to one column.
+    degenerate_inputs = [
+        np.array([0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1]).reshape(
+            -1, 1
+        ),
+        np.array([0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3]).reshape(
+            -1, 1
+        ),
+    ]
+    expected_msg = (
+        "The covariance matrix of the support data is equal to 0, try to "
+        "increase support_fraction"
+    )
+    for column in degenerate_inputs:
+        with pytest.raises(ValueError, match=expected_msg):
+            MinCovDet().fit(column)
+
+
+def test_mcd_increasing_det_warning(global_random_seed):
+    """Check the warning raised when determinants increase during the c_step.
+
+    In theory the sequence of determinants should be decreasing. Increasing
+    determinants are likely due to ill-conditioned covariance matrices that
+    result in poor precision matrices.
+    """
+    samples = [
+        [5.1, 3.5, 1.4, 0.2],
+        [4.9, 3.0, 1.4, 0.2],
+        [4.7, 3.2, 1.3, 0.2],
+        [4.6, 3.1, 1.5, 0.2],
+        [5.0, 3.6, 1.4, 0.2],
+        [4.6, 3.4, 1.4, 0.3],
+        [5.0, 3.4, 1.5, 0.2],
+        [4.4, 2.9, 1.4, 0.2],
+        [4.9, 3.1, 1.5, 0.1],
+        [5.4, 3.7, 1.5, 0.2],
+        [4.8, 3.4, 1.6, 0.2],
+        [4.8, 3.0, 1.4, 0.1],
+        [4.3, 3.0, 1.1, 0.1],
+        [5.1, 3.5, 1.4, 0.3],
+        [5.7, 3.8, 1.7, 0.3],
+        [5.4, 3.4, 1.7, 0.2],
+        [4.6, 3.6, 1.0, 0.2],
+        [5.0, 3.0, 1.6, 0.2],
+        [5.2, 3.5, 1.5, 0.2],
+    ]
+
+    estimator = MinCovDet(support_fraction=0.5, random_state=global_random_seed)
+    with pytest.warns(RuntimeWarning, match="Determinant has increased"):
+        estimator.fit(samples)
--- a/Show More
+++ b/Show More