feat: initial commit - Phase 1 & 2 core features

This commit is contained in:
hiderfong
2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,51 @@
"""Module to give helpful messages to the user that did not
compile scikit-learn properly.
"""
import os
# Hint appended to the build error when this package is found under the
# relative path "sklearn/__check_build", i.e. when a local source tree is
# being imported.
INPLACE_MSG = """
It appears that you are importing a local scikit-learn source tree. For
this, you need to have an inplace install. Maybe you are in the source
directory and you need to try from another location."""
# Default hint appended to the build error in every other case.
STANDARD_MSG = """
If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform."""
def raise_build_error(e):
    """Raise an ``ImportError`` explaining that the build looks broken.

    The message embeds the original error ``e``, a three-column listing of
    the directory that contains this module (handy when debugging on the
    mailing list) and a hint that depends on where the module was found.

    Parameters
    ----------
    e : Exception
        The error raised while importing the compiled extension.

    Raises
    ------
    ImportError
        Always.
    """
    package_dir = os.path.dirname(__file__)
    # This exact relative path means the local source tree was picked up,
    # which can only work with an 'inplace build'.
    hint = INPLACE_MSG if package_dir == "sklearn/__check_build" else STANDARD_MSG
    # Three names per row: the first two are padded to a fixed width, the
    # third one terminates the row.
    cells = [
        name.ljust(26) if position % 3 else name + "\n"
        for position, name in enumerate(os.listdir(package_dir), start=1)
    ]
    listing = "".join(cells).strip()
    raise ImportError(
        """%s
___________________________________________________________________________
Contents of %s:
%s
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.
If you have installed scikit-learn from source, please do not forget
to build the package before using it: run `python setup.py install` or
`make` in the source directory.
%s"""
        % (e, package_dir, listing, hint)
    )
# Importing the compiled helper is the check itself: when it cannot be
# imported, replace the bare ImportError with the detailed build report.
try:
    from ._check_build import check_build  # noqa
except ImportError as import_failure:
    raise_build_error(import_failure)
@@ -0,0 +1,2 @@
def check_build():
    """Do nothing and return ``None``."""
    return None
@@ -0,0 +1,7 @@
# Build the `_check_build` extension module from its Cython source and
# install it under sklearn/__check_build.
py.extension_module(
'_check_build',
'_check_build.pyx',
cython_args: cython_args,
install: true,
subdir: 'sklearn/__check_build',
)
@@ -0,0 +1,172 @@
"""
The :mod:`sklearn` module includes functions to configure global settings and
get information about the working environment.
"""
# Machine learning module for Python
# ==================================
#
# sklearn is a Python module integrating classical machine
# learning algorithms in the tightly-knit world of scientific Python
# packages (numpy, scipy, matplotlib).
#
# It aims to provide simple and efficient solutions to learning problems
# that are accessible to everybody and reusable in various contexts:
# machine-learning as a versatile tool for science and engineering.
#
# See https://scikit-learn.org for complete documentation.
import logging
import os
import random
import sys
from ._config import config_context, get_config, set_config
logger = logging.getLogger(__name__)
# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
# X.Y.0 # For first release after an increment in Y
# X.Y.Z # For bugfix releases
#
# Admissible pre-release markers:
# X.Y.ZaN # Alpha release
# X.Y.ZbN # Beta release
# X.Y.ZrcN # Release Candidate
# X.Y.Z # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = "1.5.0"
# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performances since we manually
# take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")
# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
try:
# This variable is injected in the __builtins__ by the build
# process. It is used to enable importing subpackages of sklearn when
# the binaries are not built
# mypy error: Cannot determine type of '__SKLEARN_SETUP__'
__SKLEARN_SETUP__ # type: ignore
except NameError:
__SKLEARN_SETUP__ = False
if __SKLEARN_SETUP__:
sys.stderr.write("Partial import of sklearn during the build process.\n")
# We are not importing the rest of scikit-learn during the build
# process, as it may not be compiled yet
else:
# Import numpy, scipy to make sure that the BLAS libs are loaded before
# creating the ThreadpoolController. They would be imported just after
# when importing utils anyway. This makes it explicit and robust to changes
# in utils.
# (OpenMP is loaded by importing show_versions right after this block)
import numpy # noqa
import scipy.linalg # noqa
from threadpoolctl import ThreadpoolController
# `_distributor_init` allows distributors to run custom init code.
# For instance, for the Windows wheel, this is used to pre-load the
# vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
# sub-folder.
# It is necessary to do this prior to importing show_versions as the
# latter is linked to the OpenMP runtime to make it possible to introspect
# it and importing it first would fail if the OpenMP dll cannot be found.
from . import (
__check_build, # noqa: F401
_distributor_init, # noqa: F401
)
from .base import clone
from .utils._show_versions import show_versions
__all__ = [
"calibration",
"cluster",
"covariance",
"cross_decomposition",
"datasets",
"decomposition",
"dummy",
"ensemble",
"exceptions",
"experimental",
"externals",
"feature_extraction",
"feature_selection",
"gaussian_process",
"inspection",
"isotonic",
"kernel_approximation",
"kernel_ridge",
"linear_model",
"manifold",
"metrics",
"mixture",
"model_selection",
"multiclass",
"multioutput",
"naive_bayes",
"neighbors",
"neural_network",
"pipeline",
"preprocessing",
"random_projection",
"semi_supervised",
"svm",
"tree",
"discriminant_analysis",
"impute",
"compose",
# Non-modules:
"clone",
"get_config",
"set_config",
"config_context",
"show_versions",
]
# The marker module `sklearn._built_with_meson` can only be imported when it
# is present in the installed package; record whether that is the case.
try:
    import sklearn._built_with_meson  # noqa: F401
except ModuleNotFoundError:
    _BUILT_WITH_MESON = False
else:
    _BUILT_WITH_MESON = True
# Set a global controller that can be used to locally limit the number of
# threads without looping through all shared libraries every time.
# This instantiation should not happen earlier because it needs all BLAS and
# OpenMP libs to be loaded first.
_threadpool_controller = ThreadpoolController()
def setup_module(module):
    """Seed the global RNGs of ``numpy`` and ``random`` for the tests.

    The seed is read from the ``SKLEARN_SEED`` environment variable. When the
    variable is not set, a seed is drawn at random so that it can still be
    printed and reused. The ``module`` argument is not used.
    """
    import numpy as np

    seed = os.environ.get("SKLEARN_SEED")
    if seed is None:
        # No seed requested: draw one in the int32 range.
        seed = np.random.uniform() * np.iinfo(np.int32).max
    seed = int(seed)
    print(f"I: Seeding RNGs with {seed!r}")
    np.random.seed(seed)
    random.seed(seed)
@@ -0,0 +1,116 @@
"""
Utilities useful during the build.
"""
# author: Andy Mueller, Gael Varoquaux
# license: BSD
import contextlib
import os
import sklearn
from .._min_dependencies import CYTHON_MIN_VERSION
from ..externals._packaging.version import parse
from .openmp_helpers import check_openmp_support
from .pre_build_helpers import basic_check_build
DEFAULT_ROOT = "sklearn"
def _check_cython_version():
    """Make sure Cython is importable and not older than the minimum version.

    Raises
    ------
    ModuleNotFoundError
        If Cython cannot be imported; the original error is chained.
    ValueError
        If the installed Cython is older than ``CYTHON_MIN_VERSION``.
    """
    requirement = (
        f"Please install Cython with a version >= {CYTHON_MIN_VERSION} in order "
        "to build a scikit-learn from source."
    )
    try:
        import Cython
    except ModuleNotFoundError as missing:
        # Re-raise with a message that tells the user what to install.
        raise ModuleNotFoundError(requirement) from missing

    if parse(Cython.__version__) < parse(CYTHON_MIN_VERSION):
        found = " The current version of Cython is {} installed in {}.".format(
            Cython.__version__, Cython.__path__
        )
        raise ValueError(requirement + found)
def cythonize_extensions(extension):
    """Check that a recent Cython is available and cythonize extensions.

    Parameters
    ----------
    extension
        Passed unchanged as the first argument of ``Cython.Build.cythonize``.

    Returns
    -------
    The value returned by ``Cython.Build.cythonize``.
    """
    _check_cython_version()
    from Cython.Build import cythonize

    # Fail fast, before cythonization, if the compiler cannot even build a
    # trivial program without OpenMP.
    basic_check_build()

    # `check_openmp_support` compiles a small test program to see whether the
    # compilers are properly configured to build with OpenMP. If it fails,
    # scikit-learn is built without OpenMP and the test test_openmp_supported
    # in the test suite will fail. The check is expensive, so it is done only
    # once and its result is cached as a private attribute on the sklearn
    # module (only at build-time) to be used in the build_ext subclass
    # defined in the top-level setup.py file.
    sklearn._OPENMP_SUPPORTED = check_openmp_support()

    # Cythonize with one thread per CPU when joblib is available.
    try:
        import joblib

        n_jobs = joblib.cpu_count()
    except ImportError:
        n_jobs = 1

    # Bounds checking is only switched on through an environment variable.
    debug_flag = os.environ.get("SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES", "0")
    debug_directives = debug_flag != "0"
    directives = dict(
        language_level=3,
        boundscheck=debug_directives,
        wraparound=False,
        initializedcheck=False,
        nonecheck=False,
        cdivision=True,
        profile=False,
    )
    return cythonize(
        extension,
        nthreads=n_jobs,
        compiler_directives=directives,
        annotate=False,
    )
def gen_from_templates(templates):
    """Generate Cython files from a list of Tempita templates.

    Each template ``<name>.tp`` is rendered to ``<name>`` next to it. A
    template is skipped when its output already exists and is strictly newer
    than the template.

    Parameters
    ----------
    templates : iterable of str
        Paths of the template files. Every path must end with ``.tp``.

    Raises
    ------
    ValueError
        If a path does not end with ``.tp``. Nothing is written in that case:
        without the suffix the output path would be the template itself.
    """
    suffix = ".tp"
    stale = []
    for template in templates:
        if not template.endswith(suffix):
            raise ValueError(f"Unexpected extension: {template}")
        # Strip only the trailing suffix: ``str.replace`` would also remove
        # a ".tp" occurring earlier in the path (e.g. in a directory name).
        outfile = template[: -len(suffix)]
        up_to_date = (
            os.path.exists(outfile)
            and os.stat(template).st_mtime < os.stat(outfile).st_mtime
        )
        if not up_to_date:
            stale.append((template, outfile))

    if not stale:
        # Nothing to regenerate, so Cython is not needed at all.
        return

    # Lazy import because cython is not a runtime dependency.
    from Cython import Tempita

    for template, outfile in stale:
        with open(template, "r", encoding="utf-8") as f:
            source = f.read()
        rendered = Tempita.sub(source)
        header = (
            "# WARNING: Do not edit this file directly.\n"
            f"# It is automatically generated from {template!r}.\n"
            "# Changes must be made there.\n\n"
        )
        with open(outfile, "w", encoding="utf-8") as f:
            f.write(header)
            f.write(rendered)
@@ -0,0 +1,127 @@
"""Helpers for OpenMP support during the build."""
# This code is adapted for a large part from the astropy openmp helpers, which
# can be found at: https://github.com/astropy/extension-helpers/blob/master/extension_helpers/_openmp_helpers.py # noqa
import os
import sys
import textwrap
import warnings
from .pre_build_helpers import compile_test_program
def get_openmp_flag():
    """Return the list of compiler flags used to enable OpenMP.

    Returns
    -------
    list of str
        ``["/openmp"]`` on Windows, an empty list on macOS when ``CPPFLAGS``
        already mentions "openmp", and ``["-fopenmp"]`` otherwise.
    """
    if sys.platform == "win32":
        flags = ["/openmp"]
    elif sys.platform == "darwin" and "openmp" in os.environ.get("CPPFLAGS", ""):
        # -fopenmp can't be passed as compile flag when using Apple-clang.
        # OpenMP support has to be enabled during preprocessing.
        #
        # For example, our macOS wheel build jobs use the following environment
        # variables to build with Apple-clang and the brew installed "libomp":
        #
        # export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
        # export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
        # export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
        # export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib
        #                          -L/usr/local/opt/libomp/lib -lomp"
        flags = []
    else:
        # Default flag for GCC and clang.
        flags = ["-fopenmp"]
    return flags
def check_openmp_support():
    """Check whether OpenMP test code can be compiled and run.

    Returns
    -------
    bool
        Whether the OpenMP test program could be built (and, unless
        cross-compiling, run with consistent output). Always False when the
        ``PYODIDE`` environment variable is present.

    Raises
    ------
    Exception
        If OpenMP is not supported and the ``SKLEARN_FAIL_NO_OPENMP``
        environment variable is set to a non-empty value.
    """
    if "PYODIDE" in os.environ:
        # Pyodide doesn't support OpenMP
        return False

    code = textwrap.dedent(
        """\
        #include <omp.h>
        #include <stdio.h>
        int main(void) {
        #pragma omp parallel
        printf("nthreads=%d\\n", omp_get_num_threads());
        return 0;
        }
        """
    )

    ldflags = os.getenv("LDFLAGS", None)
    if ldflags is None:
        extra_preargs = None
    else:
        # Only the linker-related flags are forwarded.
        # FIXME: temporary fix to link against system libraries on linux
        # "-Wl,--sysroot=/" should be removed
        kept_prefixes = ("-L", "-Wl,-rpath", "-l", "-Wl,--sysroot=/")
        extra_preargs = [
            flag for flag in ldflags.strip().split(" ") if flag.startswith(kept_prefixes)
        ]

    extra_postargs = get_openmp_flag()

    openmp_exception = None
    try:
        output = compile_test_program(
            code, extra_preargs=extra_preargs, extra_postargs=extra_postargs
        )
        if output and "nthreads=" in output[0]:
            # Every thread prints one line: the number of lines must match
            # the thread count reported on the first one.
            nthreads = int(output[0].strip().split("=")[1])
            openmp_supported = len(output) == nthreads
        elif "PYTHON_CROSSENV" in os.environ:
            # Since we can't run the test program when cross-compiling
            # assume that openmp is supported if the program can be
            # compiled.
            openmp_supported = True
        else:
            openmp_supported = False
    except Exception as exception:
        # We could be more specific and only catch: CompileError, LinkError,
        # and subprocess.CalledProcessError.
        # setuptools introduced CompileError and LinkError, but that requires
        # version 61.1. Even the latest version of Ubuntu (22.04LTS) only
        # ships with 59.6. So for now we catch all exceptions and reraise a
        # generic exception with the original error message instead:
        openmp_supported = False
        openmp_exception = exception

    if openmp_supported:
        return openmp_supported

    if os.getenv("SKLEARN_FAIL_NO_OPENMP"):
        raise Exception(
            "Failed to build scikit-learn with OpenMP support"
        ) from openmp_exception

    message = textwrap.dedent(
        """
        ***********
        * WARNING *
        ***********
        It seems that scikit-learn cannot be built with OpenMP.
        - Make sure you have followed the installation instructions:
        https://scikit-learn.org/dev/developers/advanced_installation.html
        - If your compiler supports OpenMP but you still see this
        message, please submit a bug report at:
        https://github.com/scikit-learn/scikit-learn/issues
        - The build will continue with OpenMP-based parallelism
        disabled. Note however that some estimators will run in
        sequential mode instead of leveraging thread-based
        parallelism.
        ***
        """
    )
    warnings.warn(message)
    return openmp_supported
@@ -0,0 +1,75 @@
"""Helpers to check build environment before actual build of scikit-learn"""
import glob
import os
import subprocess
import sys
import tempfile
import textwrap
from setuptools.command.build_ext import customize_compiler, new_compiler
def compile_test_program(code, extra_preargs=None, extra_postargs=None):
    """Check that some C code can be compiled and run.

    The program is built inside a temporary directory; the working directory
    is switched there for the duration of the build and always restored.

    Parameters
    ----------
    code : str
        Source of the C program.
    extra_preargs : list of str, default=None
        Forwarded to the link step.
    extra_postargs : list of str, default=None
        Forwarded to both the compile and the link steps.

    Returns
    -------
    list of str
        Lines printed by the program, or an empty list when the
        ``PYTHON_CROSSENV`` environment variable is present (the program is
        then built but not run).
    """
    compiler = new_compiler()
    customize_compiler(compiler)

    original_cwd = os.path.abspath(".")

    with tempfile.TemporaryDirectory() as build_dir:
        try:
            os.chdir(build_dir)

            # Write test program
            with open("test_program.c", "w") as source_file:
                source_file.write(code)

            os.mkdir("objects")

            # Compile, test program
            compiler.compile(
                ["test_program.c"], output_dir="objects", extra_postargs=extra_postargs
            )

            # Link test program
            object_pattern = os.path.join("objects", "*" + compiler.obj_extension)
            compiler.link_executable(
                glob.glob(object_pattern),
                "test_program",
                extra_preargs=extra_preargs,
                extra_postargs=extra_postargs,
            )

            if "PYTHON_CROSSENV" in os.environ:
                # Return an empty output if we are cross compiling
                # as we cannot run the test_program
                output = []
            else:
                # Run test program if not cross compiling
                # will raise a CalledProcessError if return code was non-zero
                raw_output = subprocess.check_output("./test_program")
                encoding = sys.stdout.encoding or "utf-8"
                output = raw_output.decode(encoding).splitlines()
        finally:
            # Errors propagate unchanged; only the working directory is reset.
            os.chdir(original_cwd)

    return output
def basic_check_build():
    """Check basic compilation and linking of C code.

    Does nothing when the ``PYODIDE`` environment variable is present.
    """
    if "PYODIDE" in os.environ:
        # The following check won't work in pyodide
        return

    # Smallest possible program: it only has to compile, link and exit with 0.
    program = (
        "#include <stdio.h>\n"
        "int main(void) {\n"
        "return 0;\n"
        "}\n"
    )
    compile_test_program(program)
@@ -0,0 +1,57 @@
import argparse
import os
from Cython import Tempita as tempita
# XXX: If this import ever fails (does it really?), vendor either
# cython.tempita or numpy/npy_tempita.
def process_tempita(fromfile, outfile=None):
    """Process tempita templated file and write out the result.

    The template file is expected to end in `.c.tp` or `.pyx.tp`:
    E.g. processing `template.c.tp` generates `template.c`.

    Parameters
    ----------
    fromfile : str
        Path of the template to render.
    outfile : str, default=None
        Path of the file to write. When None, it is derived from ``fromfile``
        by dropping the trailing ``.tp`` extension.

    Raises
    ------
    ValueError
        If ``outfile`` is None and ``fromfile`` does not end with ``.tp``.
    """
    if outfile is None:
        # Without an explicit target, the only safe choice is the template
        # name minus its ".tp" suffix; anything else is rejected before any
        # file is touched.
        outfile, extension = os.path.splitext(fromfile)
        if extension != ".tp":
            raise ValueError(f"Unexpected extension: {fromfile}")

    with open(fromfile, "r", encoding="utf-8") as f:
        template_content = f.read()

    template = tempita.Template(template_content)
    content = template.substitute()

    with open(outfile, "w", encoding="utf-8") as f:
        f.write(content)
def main():
    """Command-line entry point: render one ``.tp`` template into ``--outdir``.

    Raises
    ------
    ValueError
        If the input file does not end with ``.tp`` or ``--outdir`` is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str, help="Path to the input file")
    parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
    parser.add_argument(
        "-i",
        "--ignore",
        type=str,
        help=(
            "An ignored input - may be useful to add a "
            "dependency between custom targets"
        ),
    )
    args = parser.parse_args()

    infile = args.infile
    if not infile.endswith(".tp"):
        raise ValueError(f"Unexpected extension: {infile}")
    if not args.outdir:
        raise ValueError("Missing `--outdir` argument to tempita.py")

    # The generated file keeps the template's base name minus its last
    # extension and is placed in the output directory (resolved against the
    # current working directory).
    generated_name, _ = os.path.splitext(os.path.basename(infile))
    outfile = os.path.join(os.getcwd(), args.outdir, generated_name)

    process_tempita(infile, outfile)
if __name__ == "__main__":
main()
@@ -0,0 +1,13 @@
#!/usr/bin/env python
"""Extract version number from __init__.py"""
import os
# The package's top-level __init__.py sits one directory above this script.
sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py")

# Read through a context manager so the file handle is closed right away
# instead of being left to the garbage collector.
with open(sklearn_init, encoding="utf-8") as f:
    data = f.readlines()

# The version is the quoted right-hand side of the `__version__ = "..."` line.
version_line = next(line for line in data if line.startswith("__version__"))
version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "")

print(version)
@@ -0,0 +1,373 @@
"""Global configuration state and functions for management"""
import os
import threading
from contextlib import contextmanager as contextmanager
def _env_flag(name, default=False):
    """Read the environment variable ``name`` as a boolean switch.

    Returns ``default`` when the variable is unset. An empty value, "0",
    "false", "no" and "off" (case-insensitive, surrounding blanks ignored)
    mean False; any other value means True. A plain ``bool()`` on the raw
    string would treat "0" or "false" as True.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ("", "0", "false", "no", "off")


# Process-wide defaults; some of them can be overridden through environment
# variables read once, when this module is imported.
_global_config = {
    "assume_finite": _env_flag("SKLEARN_ASSUME_FINITE"),
    "working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
    "print_changed_only": True,
    "display": "diagram",
    "pairwise_dist_chunk_size": int(
        os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256)
    ),
    "enable_cython_pairwise_dist": True,
    "array_api_dispatch": False,
    "transform_output": "default",
    "enable_metadata_routing": False,
    "skip_parameter_validation": False,
}
# Holds the per-thread copy of the configuration.
_threadlocal = threading.local()
def _get_threadlocal_config():
    """Return the **mutable** configuration dict of the current thread.

    The first call made from a thread initialises its configuration with a
    copy of the default global configuration.
    """
    try:
        return _threadlocal.global_config
    except AttributeError:
        _threadlocal.global_config = _global_config.copy()
        return _threadlocal.global_config
def get_config():
    """Retrieve current values for configuration set by :func:`set_config`.

    Returns
    -------
    config : dict
        Keys are parameter names that can be passed to :func:`set_config`.

    See Also
    --------
    config_context : Context manager for global scikit-learn configuration.
    set_config : Set global scikit-learn configuration.

    Examples
    --------
    >>> import sklearn
    >>> config = sklearn.get_config()
    >>> config.keys()
    dict_keys([...])
    """
    # Hand out a copy: changing the returned dict must not change the
    # configuration itself.
    return dict(_get_threadlocal_config())
def set_config(
    assume_finite=None,
    working_memory=None,
    print_changed_only=None,
    display=None,
    pairwise_dist_chunk_size=None,
    enable_cython_pairwise_dist=None,
    array_api_dispatch=None,
    transform_output=None,
    enable_metadata_routing=None,
    skip_parameter_validation=None,
):
    """Set global scikit-learn configuration.

    .. versionadded:: 0.19

    Parameters
    ----------
    assume_finite : bool, default=None
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

        .. versionadded:: 0.19

    working_memory : int, default=None
        If set, scikit-learn will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

        .. versionadded:: 0.20

    print_changed_only : bool, default=None
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()' while the default
        behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
        all the non-changed parameters.

        .. versionadded:: 0.21

    display : {'text', 'diagram'}, default=None
        If 'diagram', estimators will be displayed as a diagram in a Jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'diagram'.

        .. versionadded:: 0.23

    pairwise_dist_chunk_size : int, default=None
        The number of row vectors per chunk for the accelerated pairwise-
        distances reduction backend. Default is 256 (suitable for most of
        modern laptops' caches and architectures).

        Intended for easier benchmarking and testing of scikit-learn internals.
        End users are not expected to benefit from customizing this configuration
        setting.

        .. versionadded:: 1.1

    enable_cython_pairwise_dist : bool, default=None
        Use the accelerated pairwise-distances reduction backend when
        possible. Global default: True.

        Intended for easier benchmarking and testing of scikit-learn internals.
        End users are not expected to benefit from customizing this configuration
        setting.

        .. versionadded:: 1.1

    array_api_dispatch : bool, default=None
        Use Array API dispatching when inputs follow the Array API standard.
        Default is False.

        See the :ref:`User Guide <array_api>` for more details.

        .. versionadded:: 1.2

    transform_output : str, default=None
        Configure output of `transform` and `fit_transform`.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        - `"default"`: Default output format of a transformer
        - `"pandas"`: DataFrame output
        - `"polars"`: Polars output
        - `None`: Transform configuration is unchanged

        .. versionadded:: 1.2
        .. versionadded:: 1.4
            `"polars"` option was added.

    enable_metadata_routing : bool, default=None
        Enable metadata routing. By default this feature is disabled.

        Refer to :ref:`metadata routing user guide <metadata_routing>` for more
        details.

        - `True`: Metadata routing is enabled
        - `False`: Metadata routing is disabled, use the old syntax.
        - `None`: Configuration is unchanged

        .. versionadded:: 1.3

    skip_parameter_validation : bool, default=None
        If `True`, disable the validation of the hyper-parameters' types and values in
        the fit method of estimators and for arguments passed to public helper
        functions. It can save time in some situations but can lead to low level
        crashes and exceptions with confusing error messages.

        Note that for data parameters, such as `X` and `y`, only type validation is
        skipped but validation with `check_array` will continue to run.

        .. versionadded:: 1.3

    See Also
    --------
    config_context : Context manager for global scikit-learn configuration.
    get_config : Retrieve current values of the global configuration.

    Examples
    --------
    >>> from sklearn import set_config
    >>> set_config(display='diagram')  # doctest: +SKIP
    """
    local_config = _get_threadlocal_config()

    # Options are applied one after the other, in signature order; ``None``
    # leaves the current value untouched.
    requested = (
        ("assume_finite", assume_finite),
        ("working_memory", working_memory),
        ("print_changed_only", print_changed_only),
        ("display", display),
        ("pairwise_dist_chunk_size", pairwise_dist_chunk_size),
        ("enable_cython_pairwise_dist", enable_cython_pairwise_dist),
        ("array_api_dispatch", array_api_dispatch),
        ("transform_output", transform_output),
        ("enable_metadata_routing", enable_metadata_routing),
        ("skip_parameter_validation", skip_parameter_validation),
    )
    for option, value in requested:
        if value is None:
            continue
        if option == "array_api_dispatch":
            # This option is checked before being stored; the helper is
            # imported only when the option is actually passed.
            from .utils._array_api import _check_array_api_dispatch

            _check_array_api_dispatch(value)
        local_config[option] = value
@contextmanager
def config_context(
    *,
    assume_finite=None,
    working_memory=None,
    print_changed_only=None,
    display=None,
    pairwise_dist_chunk_size=None,
    enable_cython_pairwise_dist=None,
    array_api_dispatch=None,
    transform_output=None,
    enable_metadata_routing=None,
    skip_parameter_validation=None,
):
    """Context manager for global scikit-learn configuration.

    Parameters
    ----------
    assume_finite : bool, default=None
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. If None, the existing value won't change.
        The default value is False.

    working_memory : int, default=None
        If set, scikit-learn will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. If None, the existing value won't change.
        The default value is 1024.

    print_changed_only : bool, default=None
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', but would print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
        when False. If None, the existing value won't change.
        The default value is True.

        .. versionchanged:: 0.23
           Default changed from False to True.

    display : {'text', 'diagram'}, default=None
        If 'diagram', estimators will be displayed as a diagram in a Jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. If None, the existing value won't change.
        The default value is 'diagram'.

        .. versionadded:: 0.23

    pairwise_dist_chunk_size : int, default=None
        The number of row vectors per chunk for the accelerated pairwise-
        distances reduction backend. Default is 256 (suitable for most of
        modern laptops' caches and architectures).

        Intended for easier benchmarking and testing of scikit-learn internals.
        End users are not expected to benefit from customizing this configuration
        setting.

        .. versionadded:: 1.1

    enable_cython_pairwise_dist : bool, default=None
        Use the accelerated pairwise-distances reduction backend when
        possible. Global default: True.

        Intended for easier benchmarking and testing of scikit-learn internals.
        End users are not expected to benefit from customizing this configuration
        setting.

        .. versionadded:: 1.1

    array_api_dispatch : bool, default=None
        Use Array API dispatching when inputs follow the Array API standard.
        Default is False.

        See the :ref:`User Guide <array_api>` for more details.

        .. versionadded:: 1.2

    transform_output : str, default=None
        Configure output of `transform` and `fit_transform`.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        - `"default"`: Default output format of a transformer
        - `"pandas"`: DataFrame output
        - `"polars"`: Polars output
        - `None`: Transform configuration is unchanged

        .. versionadded:: 1.2
        .. versionadded:: 1.4
            `"polars"` option was added.

    enable_metadata_routing : bool, default=None
        Enable metadata routing. By default this feature is disabled.

        Refer to :ref:`metadata routing user guide <metadata_routing>` for more
        details.

        - `True`: Metadata routing is enabled
        - `False`: Metadata routing is disabled, use the old syntax.
        - `None`: Configuration is unchanged

        .. versionadded:: 1.3

    skip_parameter_validation : bool, default=None
        If `True`, disable the validation of the hyper-parameters' types and values in
        the fit method of estimators and for arguments passed to public helper
        functions. It can save time in some situations but can lead to low level
        crashes and exceptions with confusing error messages.

        Note that for data parameters, such as `X` and `y`, only type validation is
        skipped but validation with `check_array` will continue to run.

        .. versionadded:: 1.3

    Yields
    ------
    None.

    See Also
    --------
    set_config : Set global scikit-learn configuration.
    get_config : Retrieve current values of the global configuration.

    Notes
    -----
    All settings, not just those presently modified, will be returned to
    their previous values when the context manager is exited. The previous
    values are also restored when applying the requested settings fails, so
    that an error on entry never leaves a partially applied configuration
    behind.

    Examples
    --------
    >>> import sklearn
    >>> from sklearn.utils.validation import assert_all_finite
    >>> with sklearn.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with sklearn.config_context(assume_finite=True):
    ...     with sklearn.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN...
    """
    old_config = get_config()

    try:
        # set_config stores the options one by one and may raise partway
        # through (the array API check runs after earlier options have been
        # stored). Calling it inside the ``try`` guarantees that the
        # snapshot taken above is restored in that case as well.
        set_config(
            assume_finite=assume_finite,
            working_memory=working_memory,
            print_changed_only=print_changed_only,
            display=display,
            pairwise_dist_chunk_size=pairwise_dist_chunk_size,
            enable_cython_pairwise_dist=enable_cython_pairwise_dist,
            array_api_dispatch=array_api_dispatch,
            transform_output=transform_output,
            enable_metadata_routing=enable_metadata_routing,
            skip_parameter_validation=skip_parameter_validation,
        )
        yield
    finally:
        set_config(**old_config)
@@ -0,0 +1,10 @@
"""Distributor init file
Distributors: you can add custom code here to support particular distributions
of scikit-learn.
For example, this is a good place to put any checks for hardware requirements.
The scikit-learn standard source distribution will not put code in this file,
so you can safely replace this file with your own version.
"""
@@ -0,0 +1,115 @@
# Author: Nelle Varoquaux, Andrew Tulloch, Antony Lee
# Uses the pool adjacent violators algorithm (PAVA), with the
# enhancement of searching for the longest decreasing subsequence to
# pool at each step.
import numpy as np
from cython cimport floating
def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
"""Pool adjacent blocks of `y` in place, using the weights `w`.

Both contiguous arrays are overwritten: pooled values are written to `y`
and summed weights to `w` at block starts. Nothing is returned.
"""
cdef:
Py_ssize_t n = y.shape[0], i, k
floating prev_y, sum_wy, sum_w
Py_ssize_t[::1] target = np.arange(n, dtype=np.intp)
# target describes a list of blocks. At any time, if [i..j] (inclusive) is
# an active block, then target[i] := j and target[j] := i.
# For "active" indices (block starts):
# w[i] := sum{w_orig[j], j=[i..target[i]]}
# y[i] := sum{y_orig[j]*w_orig[j], j=[i..target[i]]} / w[i]
with nogil:
i = 0
while i < n:
# k is the first index after the block starting at i.
k = target[i] + 1
if k == n:
break
# Strictly increasing across the block boundary: nothing to pool here.
if y[i] < y[k]:
i = k
continue
# Start accumulating the weighted sum and total weight of the pool.
sum_wy = w[i] * y[i]
sum_w = w[i]
while True:
# We are within a decreasing subsequence.
prev_y = y[k]
sum_wy += w[k] * y[k]
sum_w += w[k]
k = target[k] + 1
if k == n or prev_y < y[k]:
# Non-singleton decreasing subsequence is finished,
# update first entry.
y[i] = sum_wy / sum_w
w[i] = sum_w
target[i] = k - 1
target[k - 1] = i
if i > 0:
# Backtrack if we can. This makes the algorithm
# single-pass and ensures O(n) complexity.
i = target[i - 1]
# Otherwise, restart from the same point.
break
# Reconstruct the solution.
# Each block start holds the pooled value: copy it over the rest of the block.
i = 0
while i < n:
k = target[i] + 1
y[i + 1 : k] = y[i]
i = k
def _make_unique(const floating[::1] X,
const floating[::1] y,
const floating[::1] sample_weights):
"""Average targets for duplicate X, drop duplicates.
Aggregates duplicate X values into a single X value where
the target y is a (sample_weighted) average of the individual
targets.
Assumes that X is ordered, so that all duplicates follow each other.

Returns a tuple of three arrays, in this order: the retained X values,
the averaged targets and the summed sample weights.
"""
# Upper bound on the number of output rows, used to size the buffers.
unique_values = len(np.unique(X))
# Match the dtype of the output buffers to the fused type in use.
if floating is float:
dtype = np.float32
else:
dtype = np.float64
cdef floating[::1] y_out = np.empty(unique_values, dtype=dtype)
cdef floating[::1] x_out = np.empty_like(y_out)
cdef floating[::1] weights_out = np.empty_like(y_out)
# NOTE(review): X[0] is read unconditionally - presumably callers never pass
# an empty X; confirm.
cdef floating current_x = X[0]
cdef floating current_y = 0
cdef floating current_weight = 0
# NOTE(review): i, j and n_samples are C ints - confirm that inputs longer
# than the C int range are not expected here.
cdef int i = 0
cdef int j
cdef floating x
cdef int n_samples = len(X)
# Values closer to the current one than this resolution are merged into it.
cdef floating eps = np.finfo(dtype).resolution
for j in range(n_samples):
x = X[j]
if x - current_x >= eps:
# next unique value
x_out[i] = current_x
weights_out[i] = current_weight
y_out[i] = current_y / current_weight
i += 1
current_x = x
current_weight = sample_weights[j]
current_y = y[j] * sample_weights[j]
else:
current_weight += sample_weights[j]
current_y += y[j] * sample_weights[j]
# Flush the group that was still being accumulated when the loop ended.
x_out[i] = current_x
weights_out[i] = current_weight
y_out[i] = current_y / current_weight
# Only the first i + 1 slots were filled.
return(
np.asarray(x_out[:i+1]),
np.asarray(y_out[:i+1]),
np.asarray(weights_out[:i+1]),
)
@@ -0,0 +1,30 @@
"""
The :mod:`sklearn._loss` module includes loss function classes suitable for
fitting classification and regression tasks.
"""
from .loss import (
AbsoluteError,
HalfBinomialLoss,
HalfGammaLoss,
HalfMultinomialLoss,
HalfPoissonLoss,
HalfSquaredError,
HalfTweedieLoss,
HalfTweedieLossIdentity,
HuberLoss,
PinballLoss,
)
__all__ = [
"HalfSquaredError",
"AbsoluteError",
"PinballLoss",
"HuberLoss",
"HalfPoissonLoss",
"HalfGammaLoss",
"HalfTweedieLoss",
"HalfTweedieLossIdentity",
"HalfBinomialLoss",
"HalfMultinomialLoss",
]
@@ -0,0 +1,91 @@
# Fused types for input like y_true, raw_prediction, sample_weights.
ctypedef fused floating_in:
    double
    float


# Fused types for output like gradient and hessian
# We use different fused types for input (floating_in) and output (floating_out),
# such that input and output can have different dtypes in the same function call.
# A single fused type can only take on one single value (type) for all arguments
# in one function call.
ctypedef fused floating_out:
    double
    float


# Struct to return 2 doubles
# NOTE(review): presumably used by cy_grad_hess to carry (gradient, hessian) in
# (val1, val2) — confirm against the implementation file.
ctypedef struct double_pair:
    double val1
    double val2


# C base class for loss functions
# Every class below redeclares the same three scalar methods, each taking a
# single (y_true, raw_prediction) pair of doubles. All are declared
# `noexcept nogil`: they can be called without holding the GIL and are declared
# as never propagating a Python exception.
cdef class CyLossFunction:
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfSquaredError(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyAbsoluteError(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


# Carries one extra C attribute, `quantile`, exposed read-only to Python.
cdef class CyPinballLoss(CyLossFunction):
    cdef readonly double quantile  # readonly makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


# Carries one extra C attribute, `delta`; unlike the `readonly` attributes of
# the other classes it is declared `public`, so Python code can also assign it.
cdef class CyHuberLoss(CyLossFunction):
    cdef public double delta  # public makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfPoissonLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfGammaLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


# Carries one extra C attribute, `power`, exposed read-only to Python.
cdef class CyHalfTweedieLoss(CyLossFunction):
    cdef readonly double power  # readonly makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


# Carries one extra C attribute, `power`, exposed read-only to Python.
cdef class CyHalfTweedieLossIdentity(CyLossFunction):
    cdef readonly double power  # readonly makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfBinomialLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyExponentialLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,281 @@
"""
Module contains classes for invertible (and differentiable) link functions.
"""
# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
from abc import ABC, abstractmethod
from dataclasses import dataclass
import numpy as np
from scipy.special import expit, logit
from scipy.stats import gmean
from ..utils.extmath import softmax
@dataclass
class Interval:
    """Numeric range from ``low`` to ``high`` with optionally closed bounds.

    ``low_inclusive`` and ``high_inclusive`` state whether the respective
    bound itself counts as part of the range.
    """

    low: float
    high: float
    low_inclusive: bool
    high_inclusive: bool

    def __post_init__(self):
        """Check that low <= high"""
        if self.low > self.high:
            message = (
                f"One must have low <= high; got low={self.low}, high={self.high}."
            )
            raise ValueError(message)

    def includes(self, x):
        """Test whether all values of x are in interval range.

        Parameters
        ----------
        x : ndarray
            Array whose elements are tested to be in interval range.

        Returns
        -------
        result : bool
        """
        # Pick the comparison matching the openness of each bound.
        above_low = np.greater_equal if self.low_inclusive else np.greater
        if not np.all(above_low(x, self.low)):
            return False

        below_high = np.less_equal if self.high_inclusive else np.less
        # Note: np.all returns numpy.bool_
        return bool(np.all(below_high(x, self.high)))
def _inclusive_low_high(interval, dtype=np.float64):
    """Generate values low and high to be within the interval range.

    This is used in tests only.

    Infinite bounds are replaced by -1e10 / 1e10; finite bounds are nudged
    towards the interior of the interval by a small relative plus absolute
    margin derived from the machine epsilon of ``dtype``.

    Returns
    -------
    low, high : tuple
        The returned values low and high lie within the interval.
    """
    eps = 10 * np.finfo(dtype).eps
    lower_bound, upper_bound = interval.low, interval.high

    if lower_bound == -np.inf:
        low = -1e10
    else:
        # Scaling must move the value upwards: shrink the magnitude of a
        # negative bound, grow it otherwise.
        low_factor = (1 - eps) if lower_bound < 0 else (1 + eps)
        low = lower_bound * low_factor + eps

    if upper_bound == np.inf:
        high = 1e10
    else:
        # Mirror image of the above: the value must move downwards.
        high_factor = (1 + eps) if upper_bound < 0 else (1 - eps)
        high = upper_bound * high_factor - eps

    return low, high
class BaseLink(ABC):
    """Abstract base class for differentiable, invertible link functions.

    Convention:
        - link function g: raw_prediction = g(y_pred)
        - inverse link h: y_pred = h(raw_prediction)

    For (generalized) linear models, `raw_prediction = X @ coef` is the so
    called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
    conditional (on X) expected value of the target `y_true`.

    The methods are not implemented as staticmethods in case a link function needs
    parameters.

    Subclasses must implement both :meth:`link` and :meth:`inverse`; they may
    override the class attributes ``is_multiclass`` and ``interval_y_pred``.
    """

    is_multiclass = False  # used for testing only

    # Usually, raw_prediction may be any real number and y_pred is an open
    # interval.
    # interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
    # Default range of y_pred: the whole real line, both ends open.
    interval_y_pred = Interval(-np.inf, np.inf, False, False)

    @abstractmethod
    def link(self, y_pred, out=None):
        """Compute the link function g(y_pred).

        The link function maps (predicted) target values to raw predictions,
        i.e. `g(y_pred) = raw_prediction`.

        Parameters
        ----------
        y_pred : array
            Predicted target values.
        out : array
            A location into which the result is stored. If provided, it must
            have a shape that the inputs broadcast to. If not provided or None,
            a freshly-allocated array is returned.

        Returns
        -------
        out : array
            Output array, element-wise link function.
        """

    @abstractmethod
    def inverse(self, raw_prediction, out=None):
        """Compute the inverse link function h(raw_prediction).

        The inverse link function maps raw predictions to predicted target
        values, i.e. `h(raw_prediction) = y_pred`.

        Parameters
        ----------
        raw_prediction : array
            Raw prediction values (in link space).
        out : array
            A location into which the result is stored. If provided, it must
            have a shape that the inputs broadcast to. If not provided or None,
            a freshly-allocated array is returned.

        Returns
        -------
        out : array
            Output array, element-wise inverse link function.
        """
class IdentityLink(BaseLink):
    """The identity link function g(x)=x."""

    def link(self, y_pred, out=None):
        """Return ``y_pred`` as is, or copy it into ``out`` when one is given.

        Without ``out`` the very same object is handed back (no copy is made).
        """
        if out is None:
            return y_pred
        np.copyto(out, y_pred)
        return out

    # The identity is its own inverse.
    inverse = link
class LogLink(BaseLink):
    """The log link function g(x)=log(x)."""

    # y_pred lives on the open positive half-line.
    interval_y_pred = Interval(0, np.inf, False, False)

    def link(self, y_pred, out=None):
        """Element-wise natural logarithm of ``y_pred``."""
        raw_prediction = np.log(y_pred, out=out)
        return raw_prediction

    def inverse(self, raw_prediction, out=None):
        """Element-wise exponential of ``raw_prediction``."""
        y_pred = np.exp(raw_prediction, out=out)
        return y_pred
class LogitLink(BaseLink):
    """The logit link function g(x)=logit(x)."""

    # y_pred lives in the open unit interval.
    interval_y_pred = Interval(0, 1, False, False)

    def link(self, y_pred, out=None):
        """Element-wise ``logit`` of ``y_pred``."""
        raw_prediction = logit(y_pred, out=out)
        return raw_prediction

    def inverse(self, raw_prediction, out=None):
        """Element-wise ``expit`` of ``raw_prediction``."""
        y_pred = expit(raw_prediction, out=out)
        return y_pred
class HalfLogitLink(BaseLink):
    """Half the logit link function g(x)=1/2 * logit(x).

    Used for the exponential loss.
    """

    # y_pred lives in the open unit interval.
    interval_y_pred = Interval(0, 1, False, False)

    def link(self, y_pred, out=None):
        """Compute ``0.5 * logit(y_pred)``, halving the logit result in place."""
        half_logit = logit(y_pred, out=out)
        half_logit *= 0.5
        return half_logit

    def inverse(self, raw_prediction, out=None):
        """Compute ``expit(2 * raw_prediction)``."""
        doubled = 2 * raw_prediction
        return expit(doubled, out)
class MultinomialLogit(BaseLink):
    """The symmetric multinomial logit function.

    Convention:
        - y_pred.shape = raw_prediction.shape = (n_samples, n_classes)

    Notes:
        - The inverse link h is the softmax function.
        - The sum is over the second axis, i.e. axis=1 (n_classes).

    We have to choose additional constraints in order to make

        y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)

    for n_classes classes identifiable and invertible.
    We choose the symmetric side constraint where the geometric mean response
    is set as reference category, see [2]:

    The symmetric multinomial logit link function for a single data point is
    then defined as

        raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
        = log(y_pred[k]) - mean(log(y_pred)).

    Note that this is equivalent to the definition in [1] and implies mean
    centered raw predictions:

        sum(raw_prediction[k], k=0..n_classes-1) = 0.

    For linear models with raw_prediction = X @ coef, this corresponds to
    sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
    feature is zero.

    Reference
    ---------
    .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
        logistic regression: a statistical view of boosting" Ann. Statist.
        28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
        https://projecteuclid.org/euclid.aos/1016218223

    .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
        multinomial logit models with symmetric side constraints."
        Computational Statistics 28 (2013): 1017-1034.
        http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
    """

    is_multiclass = True
    interval_y_pred = Interval(0, 1, False, False)

    def symmetrize_raw_prediction(self, raw_prediction):
        """Subtract each row's mean so that every row sums to zero."""
        row_means = np.mean(raw_prediction, axis=1)
        return raw_prediction - row_means[:, np.newaxis]

    def link(self, y_pred, out=None):
        """Compute ``log(y_pred / gmean(y_pred))`` row by row."""
        # geometric mean as reference category
        reference = gmean(y_pred, axis=1)[:, np.newaxis]
        return np.log(y_pred / reference, out=out)

    def inverse(self, raw_prediction, out=None):
        """Apply the softmax, writing into ``out`` when one is given."""
        if out is not None:
            # Run the softmax in place on a copy held by the caller's buffer.
            np.copyto(out, raw_prediction)
            softmax(out, copy=False)
            return out
        return softmax(raw_prediction, copy=True)
# Registry mapping a link's string name to its class (the class itself, not an
# instance).
_LINKS = {
    "identity": IdentityLink,
    "log": LogLink,
    "logit": LogitLink,
    "half_logit": HalfLogitLink,
    "multinomial_logit": MultinomialLogit,
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,19 @@
# .pyx is generated, so this is needed to make Cython compilation work
# (copies _loss.pxd so that it can be listed as a source of the extension below)
_loss_cython_tree = [
  fs.copyfile('_loss.pxd')
]

# Render the Tempita template _loss.pyx.tp into _loss.pyx.
_loss_pyx = custom_target(
  '_loss_pyx',
  output: '_loss.pyx',
  input: '_loss.pyx.tp',
  command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'],
)

# Build the `_loss` extension from the generated .pyx plus the copied .pxd and
# install it under sklearn/_loss.
py.extension_module(
  '_loss',
  [_loss_pyx, _loss_cython_tree],
  cython_args: cython_args,
  install: true,
  subdir: 'sklearn/_loss',
)
@@ -0,0 +1,111 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from sklearn._loss.link import (
_LINKS,
HalfLogitLink,
Interval,
MultinomialLogit,
_inclusive_low_high,
)
# All registered link classes (not instances); the parametrized tests below
# instantiate each one themselves.
LINK_FUNCTIONS = list(_LINKS.values())
def test_interval_raises():
    """Test that interval with low > high raises ValueError."""
    expected_message = "One must have low <= high; got low=1, high=0."
    with pytest.raises(ValueError, match=expected_message):
        Interval(1, 0, False, False)
@pytest.mark.parametrize(
    "interval",
    [
        # Every open/closed combination for each pair of bounds, in the order
        # (F, F), (F, T), (T, F), (T, T).
        Interval(low, high, low_inclusive, high_inclusive)
        for low, high in [(0, 1), (-np.inf, np.inf), (-10, -1)]
        for low_inclusive in (False, True)
        for high_inclusive in (False, True)
    ],
)
def test_is_in_range(interval):
    """Check `Interval.includes` on interior points and on both bounds."""
    # make sure low and high are always within the interval, used for linspace
    low, high = _inclusive_low_high(interval)
    x = np.linspace(low, high, num=10)
    assert interval.includes(x)

    # x contains lower bound
    x_with_low = np.r_[x, interval.low]
    assert interval.includes(x_with_low) == interval.low_inclusive

    # x contains upper bound
    x_with_high = np.r_[x, interval.high]
    assert interval.includes(x_with_high) == interval.high_inclusive

    # x contains upper and lower bound
    x_with_both = np.r_[x, interval.low, interval.high]
    both_inclusive = interval.low_inclusive and interval.high_inclusive
    assert interval.includes(x_with_both) == both_inclusive
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_inverse_identity(link, global_random_seed):
    # Test that link of inverse gives identity.
    rng = np.random.RandomState(global_random_seed)
    link_obj = link()
    n_samples, n_classes = 100, None

    # The values for `raw_prediction` are limited from -20 to 20 because in the
    # class `LogitLink` the term `expit(x)` comes very close to 1 for large
    # positive x and therefore loses precision.
    if link_obj.is_multiclass:
        n_classes = 10
        bound, size = 20, (n_samples, n_classes)
    elif isinstance(link_obj, HalfLogitLink):
        # The half logit doubles its argument, hence the halved range.
        bound, size = 10, (n_samples)
    else:
        bound, size = 20, (n_samples)

    raw_prediction = rng.uniform(low=-bound, high=bound, size=size)
    if link_obj.is_multiclass and isinstance(link_obj, MultinomialLogit):
        raw_prediction = link_obj.symmetrize_raw_prediction(raw_prediction)

    # g(h(raw)) == raw
    round_trip_raw = link_obj.link(link_obj.inverse(raw_prediction))
    assert_allclose(round_trip_raw, raw_prediction)

    # h(g(y)) == y
    y_pred = link_obj.inverse(raw_prediction)
    round_trip_y = link_obj.inverse(link_obj.link(y_pred))
    assert_allclose(round_trip_y, y_pred)
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_out_argument(link):
    # Test that out argument gets assigned the result.
    rng = np.random.RandomState(42)
    link_obj = link()
    n_samples, n_classes = 100, None
    if link_obj.is_multiclass:
        n_classes = 10
        raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes))
        if isinstance(link_obj, MultinomialLogit):
            raw_prediction = link_obj.symmetrize_raw_prediction(raw_prediction)
    else:
        # So far, the valid interval of raw_prediction is (-inf, inf) and
        # we do not need to distinguish.
        raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples))

    y_pred = link_obj.inverse(raw_prediction, out=None)

    # (function under test, its input, template for the `out` buffer, expected)
    checks = [
        (link_obj.inverse, raw_prediction, raw_prediction, y_pred),
        (link_obj.link, y_pred, y_pred, raw_prediction),
    ]
    for func, argument, template, expected in checks:
        buffer = np.empty_like(template)
        returned = func(argument, out=buffer)
        assert_allclose(expected, buffer)
        assert_array_equal(buffer, returned)
        # The returned array must be the buffer, not a fresh allocation.
        assert np.shares_memory(buffer, returned)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,67 @@
"""All minimum dependencies for scikit-learn."""
import argparse
from collections import defaultdict
# scipy and cython should be in sync with pyproject.toml
NUMPY_MIN_VERSION = "1.19.5"
SCIPY_MIN_VERSION = "1.6.0"
JOBLIB_MIN_VERSION = "1.2.0"
THREADPOOLCTL_MIN_VERSION = "3.1.0"
PYTEST_MIN_VERSION = "7.1.2"
CYTHON_MIN_VERSION = "3.0.10"


# 'build' and 'install' are included to have structured metadata for CI.
# They will NOT be included in setup's extras_require
# The values are (version_spec, comma separated tags)
dependent_packages = {
    "numpy": (NUMPY_MIN_VERSION, "build, install"),
    "scipy": (SCIPY_MIN_VERSION, "build, install"),
    "joblib": (JOBLIB_MIN_VERSION, "install"),
    "threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"),
    "cython": (CYTHON_MIN_VERSION, "build"),
    "meson-python": ("0.15.0", "build"),
    "matplotlib": ("3.3.4", "benchmark, docs, examples, tests"),
    "scikit-image": ("0.17.2", "docs, examples, tests"),
    "pandas": ("1.1.5", "benchmark, docs, examples, tests"),
    "seaborn": ("0.9.0", "docs, examples"),
    "memory_profiler": ("0.57.0", "benchmark, docs"),
    "pytest": (PYTEST_MIN_VERSION, "tests"),
    "pytest-cov": ("2.9.0", "tests"),
    "ruff": ("0.2.1", "tests"),
    "black": ("24.3.0", "tests"),
    "mypy": ("1.9", "tests"),
    "pyamg": ("4.0.0", "tests"),
    "polars": ("0.20.23", "docs, tests"),
    "pyarrow": ("12.0.0", "tests"),
    "sphinx": ("6.0.0", "docs"),
    "sphinx-copybutton": ("0.5.2", "docs"),
    "sphinx-gallery": ("0.15.0", "docs"),
    "numpydoc": ("1.2.0", "docs, tests"),
    "Pillow": ("7.1.2", "docs"),
    "pooch": ("1.6.0", "docs, examples, tests"),
    "sphinx-prompt": ("1.3.0", "docs"),
    "sphinxext-opengraph": ("0.4.2", "docs"),
    "plotly": ("5.14.0", "docs, examples"),
    # XXX: Pin conda-lock to the latest released version (needs manual update
    # from time to time)
    "conda-lock": ("2.5.6", "maintenance"),
}


# create inverse mapping for setuptools
# (tag -> list of "package>=min_version" requirement strings)
tag_to_packages: dict = defaultdict(list)
for package, (min_version, extras) in dependent_packages.items():
    for extra in extras.split(", "):
        tag_to_packages[extra].append(f"{package}>={min_version}")


# Used by CI to get the min dependencies
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get min dependencies for a package")
    parser.add_argument("package", choices=dependent_packages)
    args = parser.parse_args()
    # Print only the version spec, i.e. the first element of the tuple.
    print(dependent_packages[args.package][0])
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,56 @@
"""
The :mod:`sklearn.cluster` module gathers popular unsupervised clustering
algorithms.
"""
from ._affinity_propagation import AffinityPropagation, affinity_propagation
from ._agglomerative import (
AgglomerativeClustering,
FeatureAgglomeration,
linkage_tree,
ward_tree,
)
from ._bicluster import SpectralBiclustering, SpectralCoclustering
from ._birch import Birch
from ._bisect_k_means import BisectingKMeans
from ._dbscan import DBSCAN, dbscan
from ._hdbscan.hdbscan import HDBSCAN
from ._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
from ._mean_shift import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
from ._optics import (
OPTICS,
cluster_optics_dbscan,
cluster_optics_xi,
compute_optics_graph,
)
from ._spectral import SpectralClustering, spectral_clustering
__all__ = [
"AffinityPropagation",
"AgglomerativeClustering",
"Birch",
"DBSCAN",
"OPTICS",
"cluster_optics_dbscan",
"cluster_optics_xi",
"compute_optics_graph",
"KMeans",
"BisectingKMeans",
"FeatureAgglomeration",
"MeanShift",
"MiniBatchKMeans",
"SpectralClustering",
"affinity_propagation",
"dbscan",
"estimate_bandwidth",
"get_bin_seeds",
"k_means",
"kmeans_plusplus",
"linkage_tree",
"mean_shift",
"spectral_clustering",
"ward_tree",
"SpectralBiclustering",
"SpectralCoclustering",
"HDBSCAN",
]
@@ -0,0 +1,604 @@
"""Affinity Propagation clustering algorithm."""
# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
# Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from .._config import config_context
from ..base import BaseEstimator, ClusterMixin, _fit_context
from ..exceptions import ConvergenceWarning
from ..metrics import euclidean_distances, pairwise_distances_argmin
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.validation import check_is_fitted
def _equal_similarities_and_preferences(S, preference):
    """Tell whether all preferences and all off-diagonal similarities are equal.

    The similarities are only inspected when the preferences are all equal.
    """
    preferences_equal = np.all(preference == preference.flat[0])
    if not preferences_equal:
        return preferences_equal

    # Create mask to ignore diagonal of S
    off_diagonal_mask = np.ones(S.shape, dtype=bool)
    np.fill_diagonal(off_diagonal_mask, 0)
    off_diagonal = S[off_diagonal_mask]

    return np.all(off_diagonal == off_diagonal[0])
def _affinity_propagation(
    S,
    *,
    preference,
    convergence_iter,
    max_iter,
    damping,
    verbose,
    return_n_iter,
    random_state,
):
    """Main affinity propagation algorithm.

    Parameters
    ----------
    S : ndarray of shape (n_samples, n_samples)
        Similarity matrix. It is modified in place: its diagonal is
        overwritten with `preference` and a small random perturbation is
        added to every entry.
    preference : ndarray
        Preference value(s) written onto the diagonal of `S`.
    convergence_iter : int
        Number of most recent iterations over which the exemplar decisions
        must be stable for the loop to stop.
    max_iter : int
        Maximum number of message-passing iterations.
    damping : float
        Weight given to the previous messages when they are updated; the new
        messages get weight `1 - damping`.
    verbose : bool
        Whether to print the convergence status.
    return_n_iter : bool
        Whether to append the iteration count to the returned tuple.
    random_state : object
        Random generator; only its `standard_normal` method is used here.

    Returns
    -------
    cluster_centers_indices : ndarray or list
        Indices of the exemplars; an empty list when no exemplar was found.
    labels : ndarray of shape (n_samples,)
        Cluster label of each sample; all `-1` when no exemplar was found.
    n_iter : int
        Only returned when `return_n_iter` is true. It is 0 for the degenerate
        early return below, `it + 1` otherwise.
    """
    n_samples = S.shape[0]
    if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
        # It makes no sense to run the algorithm in this case, so return 1 or
        # n_samples clusters, depending on preferences
        warnings.warn(
            "All samples have mutually equal similarities. "
            "Returning arbitrary cluster center(s)."
        )
        # S.flat[n_samples - 1] is the last entry of the first row of S; it is
        # used as the representative similarity to compare the preference with.
        if preference.flat[0] > S.flat[n_samples - 1]:
            # Every sample becomes its own cluster center.
            return (
                (np.arange(n_samples), np.arange(n_samples), 0)
                if return_n_iter
                else (np.arange(n_samples), np.arange(n_samples))
            )
        else:
            # A single cluster whose center is sample 0.
            return (
                (np.array([0]), np.array([0] * n_samples), 0)
                if return_n_iter
                else (np.array([0]), np.array([0] * n_samples))
            )

    # Place preference on the diagonal of S
    S.flat[:: (n_samples + 1)] = preference

    A = np.zeros((n_samples, n_samples))
    R = np.zeros((n_samples, n_samples))  # Initialize messages
    # Intermediate results
    tmp = np.zeros((n_samples, n_samples))

    # Remove degeneracies
    S += (
        np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
    ) * random_state.standard_normal(size=(n_samples, n_samples))

    # Execute parallel affinity propagation updates
    # e[:, t] stores the exemplar decisions of the last `convergence_iter`
    # iterations (used as a circular buffer indexed by it % convergence_iter).
    e = np.zeros((n_samples, convergence_iter))

    ind = np.arange(n_samples)

    for it in range(max_iter):
        # tmp = A + S; compute responsibilities
        np.add(A, S, tmp)
        I = np.argmax(tmp, axis=1)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        # Mask out each row's maximum so that Y2 is the second largest value.
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)

        # tmp = Rnew
        np.subtract(S, Y[:, None], tmp)
        tmp[ind, I] = S[ind, I] - Y2

        # Damping
        tmp *= 1 - damping
        R *= damping
        R += tmp

        # tmp = Rp; compute availabilities
        np.maximum(R, 0, tmp)
        tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        # Save the diagonal before clipping; it is restored unclipped below.
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[:: n_samples + 1] = dA

        # Damping
        tmp *= 1 - damping
        A *= damping
        A -= tmp

        # Check for convergence
        E = (np.diag(A) + np.diag(R)) > 0
        e[:, it % convergence_iter] = E
        K = np.sum(E, axis=0)

        if it >= convergence_iter:
            se = np.sum(e, axis=1)
            # Converged when every sample was either always or never an
            # exemplar over the whole window.
            unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
            # NOTE(review): `it` comes from range(max_iter), so
            # `it == max_iter` can never be true here; running out of
            # iterations is handled by the for/else branch below.
            if (not unconverged and (K > 0)) or (it == max_iter):
                never_converged = False
                if verbose:
                    print("Converged after %d iterations." % it)
                break
    else:
        # Loop ended without `break`: the iteration budget was exhausted.
        never_converged = True
        if verbose:
            print("Did not converge")

    I = np.flatnonzero(E)
    K = I.size  # Identify exemplars

    if K > 0:
        if never_converged:
            warnings.warn(
                (
                    "Affinity propagation did not converge, this model "
                    "may return degenerate cluster centers and labels."
                ),
                ConvergenceWarning,
            )
        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)  # Identify clusters
        # Refine the final set of exemplars and clusters and return results
        for k in range(K):
            ii = np.where(c == k)[0]
            # Within cluster k, pick the member with the largest summed
            # similarity from the cluster's members as the new exemplar.
            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
            I[k] = ii[j]

        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)
        labels = I[c]
        # Reduce labels to a sorted, gapless, list
        cluster_centers_indices = np.unique(labels)
        labels = np.searchsorted(cluster_centers_indices, labels)
    else:
        warnings.warn(
            (
                "Affinity propagation did not converge and this model "
                "will not have any cluster centers."
            ),
            ConvergenceWarning,
        )
        labels = np.array([-1] * n_samples)
        cluster_centers_indices = []

    if return_n_iter:
        return cluster_centers_indices, labels, it + 1
    else:
        return cluster_centers_indices, labels
###############################################################################
# Public API
@validate_params(
    {
        "S": ["array-like"],
        "return_n_iter": ["boolean"],
    },
    prefer_skip_nested_validation=False,
)
def affinity_propagation(
    S,
    *,
    preference=None,
    convergence_iter=15,
    max_iter=200,
    damping=0.5,
    copy=True,
    verbose=False,
    return_n_iter=False,
    random_state=None,
):
    """Perform Affinity Propagation Clustering of data.

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------
    S : array-like of shape (n_samples, n_samples)
        Matrix of similarities between points.

    preference : array-like of shape (n_samples,) or float, default=None
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number of
        exemplars, i.e. of clusters, is influenced by the input preferences
        value. If the preferences are not passed as arguments, they will be
        set to the median of the input similarities (resulting in a moderate
        number of clusters). For a smaller amount of clusters, this can be set
        to the minimum value of the similarities.

    convergence_iter : int, default=15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    max_iter : int, default=200
        Maximum number of iterations.

    damping : float, default=0.5
        Damping factor between 0.5 and 1.

    copy : bool, default=True
        If copy is False, the affinity matrix is modified inplace by the
        algorithm, for memory efficiency.

    verbose : bool, default=False
        The verbosity level.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            this parameter was previously hardcoded as 0.

    Returns
    -------
    cluster_centers_indices : ndarray of shape (n_clusters,)
        Index of clusters centers.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is
        set to True.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

    When the algorithm does not converge, it will still return arrays of
    ``cluster_center_indices`` and labels if there are any exemplars/clusters,
    however they may be degenerate and should be used with caution.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, a single cluster center
    and label ``0`` for every sample will be returned. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------
    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import affinity_propagation
    >>> from sklearn.metrics.pairwise import euclidean_distances
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> S = -euclidean_distances(X, squared=True)
    >>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
    >>> cluster_centers_indices
    array([0, 3])
    >>> labels
    array([0, 0, 0, 1, 1, 1])
    """
    # Delegate to the estimator, treating S as a precomputed affinity matrix.
    fitted = AffinityPropagation(
        affinity="precomputed",
        preference=preference,
        damping=damping,
        max_iter=max_iter,
        convergence_iter=convergence_iter,
        copy=copy,
        verbose=verbose,
        random_state=random_state,
    )
    fitted.fit(S)

    outputs = (fitted.cluster_centers_indices_, fitted.labels_)
    if return_n_iter:
        outputs = outputs + (fitted.n_iter_,)
    return outputs
class AffinityPropagation(ClusterMixin, BaseEstimator):
"""Perform Affinity Propagation Clustering of data.
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
damping : float, default=0.5
Damping factor in the range `[0.5, 1.0)` is the extent to
which the current value is maintained relative to
incoming values (weighted 1 - damping). This in order
to avoid numerical oscillations when updating these
values (messages).
max_iter : int, default=200
Maximum number of iterations.
convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
copy : bool, default=True
Make a copy of input data.
preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number
of exemplars, ie of clusters, is influenced by the input
preferences value. If the preferences are not passed as arguments,
they will be set to the median of the input similarities.
affinity : {'euclidean', 'precomputed'}, default='euclidean'
Which affinity to use. At the moment 'precomputed' and
``euclidean`` are supported. 'euclidean' uses the
negative squared euclidean distance between points.
verbose : bool, default=False
Whether to be verbose.
random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Attributes
----------
cluster_centers_indices_ : ndarray of shape (n_clusters,)
Indices of cluster centers.
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Cluster centers (if affinity != ``precomputed``).
labels_ : ndarray of shape (n_samples,)
Labels of each point.
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the affinity matrix used in ``fit``.
n_iter_ : int
Number of iterations taken to converge.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
AgglomerativeClustering : Recursively merges the pair of
clusters that minimally increases a given linkage distance.
FeatureAgglomeration : Similar to AgglomerativeClustering,
but recursively merges features instead of samples.
KMeans : K-Means clustering.
MiniBatchKMeans : Mini-Batch K-Means clustering.
MeanShift : Mean shift clustering using a flat kernel.
SpectralClustering : Apply clustering to a projection
of the normalized Laplacian.
Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
The algorithmic complexity of affinity propagation is quadratic
in the number of points.
When the algorithm does not converge, it will still return a arrays of
``cluster_center_indices`` and labels if there are any exemplars/clusters,
however they may be degenerate and should be used with caution.
When ``fit`` does not converge, ``cluster_centers_`` is still populated
however it may be degenerate. In such a case, proceed with caution.
If ``fit`` does not converge and fails to produce any ``cluster_centers_``
then ``predict`` will label every sample as ``-1``.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, ``fit`` will result in
a single cluster center and label ``0`` for every sample. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
Examples
--------
>>> from sklearn.cluster import AffinityPropagation
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> clustering = AffinityPropagation(random_state=5).fit(X)
>>> clustering
AffinityPropagation(random_state=5)
>>> clustering.labels_
array([0, 0, 0, 1, 1, 1])
>>> clustering.predict([[0, 0], [4, 4]])
array([0, 1])
>>> clustering.cluster_centers_
array([[1, 2],
[4, 2]])
"""
_parameter_constraints: dict = {
"damping": [Interval(Real, 0.5, 1.0, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"convergence_iter": [Interval(Integral, 1, None, closed="left")],
"copy": ["boolean"],
"preference": [
"array-like",
Interval(Real, None, None, closed="neither"),
None,
],
"affinity": [StrOptions({"euclidean", "precomputed"})],
"verbose": ["verbose"],
"random_state": ["random_state"],
}
def __init__(
self,
*,
damping=0.5,
max_iter=200,
convergence_iter=15,
copy=True,
preference=None,
affinity="euclidean",
verbose=False,
random_state=None,
):
self.damping = damping
self.max_iter = max_iter
self.convergence_iter = convergence_iter
self.copy = copy
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.random_state = random_state
def _more_tags(self):
return {"pairwise": self.affinity == "precomputed"}
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
    """Fit the clustering from features, or affinity matrix.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            array-like of shape (n_samples, n_samples)
        Training instances to cluster, or similarities / affinities between
        instances if ``affinity='precomputed'``. If a sparse feature matrix
        is provided, it will be converted into a sparse ``csr_matrix``.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self
        Returns the instance itself.

    Raises
    ------
    ValueError
        If the resulting matrix of similarities is not square.
    """
    precomputed = self.affinity == "precomputed"

    # Sparse input is only accepted (as CSR) when X holds raw features.
    X = self._validate_data(X, accept_sparse=False if precomputed else "csr")

    if precomputed:
        similarities = X.copy() if self.copy else X
    else:  # self.affinity == "euclidean"
        # Similarity is the negated squared Euclidean distance.
        similarities = -euclidean_distances(X, squared=True)
    self.affinity_matrix_ = similarities

    shape = self.affinity_matrix_.shape
    if shape[0] != shape[1]:
        raise ValueError(
            "The matrix of similarities must be a square array. "
            f"Got {shape} instead."
        )

    # Without an explicit preference, fall back to the median similarity.
    preference = self.preference
    if preference is None:
        preference = np.median(self.affinity_matrix_)
    preference = np.asarray(preference)

    random_state = check_random_state(self.random_state)

    center_indices, labels, n_iter = _affinity_propagation(
        self.affinity_matrix_,
        max_iter=self.max_iter,
        convergence_iter=self.convergence_iter,
        preference=preference,
        damping=self.damping,
        verbose=self.verbose,
        return_n_iter=True,
        random_state=random_state,
    )
    self.cluster_centers_indices_ = center_indices
    self.labels_ = labels
    self.n_iter_ = n_iter

    # Exemplar coordinates only exist when X contains feature vectors.
    if not precomputed:
        self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
    return self
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        New data to predict. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster labels. Every label is ``-1`` when the fitted model has no
        cluster centers.

    Raises
    ------
    ValueError
        If the estimator has no ``cluster_centers_`` attribute, which is
        the case after fitting with ``affinity='precomputed'``.
    """
    check_is_fitted(self)
    X = self._validate_data(X, reset=False, accept_sparse="csr")
    if not hasattr(self, "cluster_centers_"):
        raise ValueError(
            "Predict method is not supported when affinity='precomputed'."
        )

    n_centers = self.cluster_centers_.shape[0]
    if n_centers <= 0:
        # Nothing to assign samples to: warn and label everything as -1.
        warnings.warn(
            (
                "This model does not have any cluster centers "
                "because affinity propagation did not converge. "
                "Labeling every sample as '-1'."
            ),
            ConvergenceWarning,
        )
        return np.array([-1] * X.shape[0])

    # X has already been validated above, so skip the finiteness checks.
    with config_context(assume_finite=True):
        return pairwise_distances_argmin(X, self.cluster_centers_)
def fit_predict(self, X, y=None):
    """Fit clustering from features/affinity matrix; return cluster labels.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            array-like of shape (n_samples, n_samples)
        Training instances to cluster, or similarities / affinities between
        instances if ``affinity='precomputed'``. If a sparse feature matrix
        is provided, it will be converted into a sparse ``csr_matrix``.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster labels.
    """
    # Pure delegation to the parent class; presumably overridden only so
    # that this estimator carries its own docstring -- confirm.
    return super().fit_predict(X, y)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,624 @@
"""Spectral biclustering algorithms."""
# Authors : Kemal Eren
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
from numbers import Integral
import numpy as np
from scipy.linalg import norm
from scipy.sparse import dia_matrix, issparse
from scipy.sparse.linalg import eigsh, svds
from ..base import BaseEstimator, BiclusterMixin, _fit_context
from ..utils import check_random_state, check_scalar
from ..utils._param_validation import Interval, StrOptions
from ..utils.extmath import make_nonnegative, randomized_svd, safe_sparse_dot
from ..utils.validation import assert_all_finite
from ._kmeans import KMeans, MiniBatchKMeans
__all__ = ["SpectralCoclustering", "SpectralBiclustering"]
def _scale_normalize(X):
    """Normalize ``X`` by scaling rows and columns independently.

    Each entry is multiplied by the inverse square root of its row sum and
    of its column sum.

    Returns the normalized matrix and the row and column scaling
    factors.
    """
    X = make_nonnegative(X)

    def _inverse_sqrt_sums(axis):
        # 1 / sqrt(sum along ``axis``), flattened, with NaN entries zeroed.
        factors = np.asarray(1.0 / np.sqrt(X.sum(axis=axis))).squeeze()
        return np.where(np.isnan(factors), 0, factors)

    row_diag = _inverse_sqrt_sums(1)
    col_diag = _inverse_sqrt_sums(0)

    if not issparse(X):
        # Dense case: plain broadcasting over rows, then columns.
        return row_diag[:, np.newaxis] * X * col_diag, row_diag, col_diag

    # Sparse case: multiply by diagonal matrices so the result stays sparse.
    n_rows, n_cols = X.shape
    left = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
    right = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
    return left * X * right, row_diag, col_diag
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
    """Normalize rows and columns of ``X`` simultaneously so that all
    rows sum to one constant and all columns sum to a different
    constant.

    The matrix is rescaled repeatedly with ``_scale_normalize`` until two
    consecutive iterates differ by less than ``tol`` or ``max_iter``
    iterations have been performed.

    Parameters
    ----------
    X : ndarray or sparse matrix
        Data to normalize; it is first passed through ``make_nonnegative``.

    max_iter : int, default=1000
        Maximum number of rescaling iterations.

    tol : float, default=1e-5
        Convergence threshold on the norm of the change between two
        consecutive iterates.

    Returns
    -------
    X_scaled : ndarray or sparse matrix
        The last iterate that was computed.
    """
    # According to paper, this can also be done more efficiently with
    # deviation reduction and balancing algorithms.
    X = make_nonnegative(X)
    X_scaled = X
    for _ in range(max_iter):
        X_new, _, _ = _scale_normalize(X_scaled)
        if issparse(X):
            # Compare consecutive iterates, exactly like the dense branch.
            # Measuring against the original ``X`` gives a distance of 0 on
            # the first pass (``X_scaled`` is ``X``) and stops the loop
            # after one step. Subtracting the sparse matrices themselves
            # avoids relying on both ``.data`` arrays sharing one layout.
            dist = norm((X_scaled - X_new).data)
        else:
            dist = norm(X_scaled - X_new)
        X_scaled = X_new
        if dist < tol:
            break
    return X_scaled
def _log_normalize(X):
    """Normalize ``X`` according to Kluger's log-interactions scheme.

    The element-wise logarithm of the shifted data is taken, then the row
    means and column means are subtracted and the overall mean is added
    back. Sparse input is rejected with a ``ValueError``.
    """
    X = make_nonnegative(X, min_value=1)
    if issparse(X):
        raise ValueError(
            "Cannot compute log of a sparse matrix,"
            " because log(x) diverges to -infinity as x"
            " goes to 0."
        )
    log_X = np.log(X)
    row_means = log_X.mean(axis=1)[:, np.newaxis]
    col_means = log_X.mean(axis=0)
    overall_mean = log_X.mean()
    return log_X - row_means - col_means + overall_mean
class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for spectral biclustering.

    Stores the parameters shared by the concrete estimators and provides the
    SVD and k-means helpers they rely on. ``fit`` calls ``_check_parameters``
    and ``_fit``, which subclasses are expected to provide.
    """

    _parameter_constraints: dict = {
        "svd_method": [StrOptions({"randomized", "arpack"})],
        "n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None],
        "mini_batch": ["boolean"],
        "init": [StrOptions({"k-means++", "random"}), np.ndarray],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "random_state": ["random_state"],
    }

    @abstractmethod
    def __init__(
        self,
        n_clusters=3,
        svd_method="randomized",
        n_svd_vecs=None,
        mini_batch=False,
        init="k-means++",
        n_init=10,
        random_state=None,
    ):
        # Parameters are stored verbatim; nothing is validated here.
        self.n_clusters = n_clusters
        self.svd_method = svd_method
        self.n_svd_vecs = n_svd_vecs
        self.mini_batch = mini_batch
        self.init = init
        self.n_init = n_init
        self.random_state = random_state

    @abstractmethod
    def _check_parameters(self, n_samples):
        """Validate parameters depending on the input data."""

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Create a biclustering for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            The instance itself, after ``_fit`` has run.
        """
        X = self._validate_data(X, accept_sparse="csr", dtype=np.float64)
        n_samples = X.shape[0]
        self._check_parameters(n_samples)
        self._fit(X)
        return self

    def _svd(self, array, n_components, n_discard):
        """Returns first `n_components` left and right singular
        vectors u and v, discarding the first `n_discard`.
        """
        if self.svd_method == "randomized":
            # ``n_svd_vecs`` is forwarded as ``n_oversamples`` only when set.
            extra = (
                {} if self.n_svd_vecs is None else {"n_oversamples": self.n_svd_vecs}
            )
            u, _, vt = randomized_svd(
                array, n_components, random_state=self.random_state, **extra
            )

        elif self.svd_method == "arpack":
            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)

            def _eigenvectors(gram):
                # Eigenvectors of a Gram matrix, used to replace singular
                # vectors that came back as NaN.
                rng = check_random_state(self.random_state)
                # initialize with [-1,1] as in ARPACK
                start = rng.uniform(-1, 1, gram.shape[0])
                _, vectors = eigsh(gram, ncv=self.n_svd_vecs, v0=start)
                return vectors

            if np.any(np.isnan(vt)):
                # some eigenvalues of A * A.T are negative, causing
                # sqrt() to be np.nan. This causes some vectors in vt
                # to be np.nan.
                vt = _eigenvectors(safe_sparse_dot(array.T, array)).T
            if np.any(np.isnan(u)):
                u = _eigenvectors(safe_sparse_dot(array, array.T))

        assert_all_finite(u)
        assert_all_finite(vt)
        return u[:, n_discard:], vt[n_discard:].T

    def _k_means(self, data, n_clusters):
        """Cluster ``data`` with (mini-batch) k-means.

        Returns the cluster centers and the label of each row of ``data``.
        """
        estimator_cls = MiniBatchKMeans if self.mini_batch else KMeans
        model = estimator_cls(
            n_clusters,
            init=self.init,
            n_init=self.n_init,
            random_state=self.random_state,
        )
        model.fit(data)
        return model.cluster_centers_, model.labels_

    def _more_tags(self):
        # Estimator checks listed as expected failures, with their reasons.
        expected_failures = {
            "check_estimators_dtypes": "raises nan error",
            "check_fit2d_1sample": "_scale_normalize fails",
            "check_fit2d_1feature": "raises apply_along_axis error",
            "check_estimator_sparse_matrix": "does not fail gracefully",
            "check_estimator_sparse_array": "does not fail gracefully",
            "check_methods_subset_invariance": "empty array passed inside",
            "check_dont_overwrite_parameters": "empty array passed inside",
            "check_fit2d_predict1d": "empty array passed inside",
        }
        return {"_xfail_checks": expected_failures}
class SpectralCoclustering(BaseSpectral):
    """Spectral Co-Clustering algorithm (Dhillon, 2001).

    Clusters rows and columns of an array `X` to solve the relaxed
    normalized cut of the bipartite graph created from `X` as follows:
    the edge between row vertex `i` and column vertex `j` has weight
    `X[i, j]`.

    The resulting bicluster structure is block-diagonal, since each
    row and each column belongs to exactly one bicluster.

    Supports sparse matrices, as long as they are nonnegative.

    Read more in the :ref:`User Guide <spectral_coclustering>`.

    Parameters
    ----------
    n_clusters : int, default=3
        The number of biclusters to find.

    svd_method : {'randomized', 'arpack'}, default='randomized'
        Selects the algorithm for finding singular vectors. May be
        'randomized' or 'arpack'. If 'randomized', use
        :func:`sklearn.utils.extmath.randomized_svd`, which may be faster
        for large matrices. If 'arpack', use
        :func:`scipy.sparse.linalg.svds`, which is more accurate, but
        possibly slower in some cases.

    n_svd_vecs : int, default=None
        Number of vectors to use in calculating the SVD. Corresponds
        to `ncv` when `svd_method='arpack'` and `n_oversamples` when
        `svd_method='randomized'`.

    mini_batch : bool, default=False
        Whether to use mini-batch k-means, which is faster but may get
        different results.

    init : {'k-means++', 'random'}, or ndarray of shape \
            (n_clusters, n_features), default='k-means++'
        Method for initialization of k-means algorithm; defaults to
        'k-means++'.

    n_init : int, default=10
        Number of random initializations that are tried with the
        k-means algorithm.

        If mini-batch k-means is used, the best initialization is
        chosen and the algorithm runs once. Otherwise, the algorithm
        is run for each initialization and the best solution chosen.

    random_state : int, RandomState instance, default=None
        Used for randomizing the singular value decomposition and the k-means
        initialization. Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    rows_ : array-like of shape (n_row_clusters, n_rows)
        Results of the clustering. `rows[i, r]` is True if
        cluster `i` contains row `r`. Available only after calling ``fit``.

    columns_ : array-like of shape (n_column_clusters, n_columns)
        Results of the clustering, like `rows`.

    row_labels_ : array-like of shape (n_rows,)
        The bicluster label of each row.

    column_labels_ : array-like of shape (n_cols,)
        The bicluster label of each column.

    biclusters_ : tuple of two ndarrays
        The tuple contains the `rows_` and `columns_` arrays.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SpectralBiclustering : Partitions rows and columns under the assumption
        that the data has an underlying checkerboard structure.

    References
    ----------
    * :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using
      bipartite spectral graph partitioning.
      <10.1145/502512.502550>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralCoclustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
    >>> clustering.row_labels_ #doctest: +SKIP
    array([0, 1, 1, 0, 0, 0], dtype=int32)
    >>> clustering.column_labels_ #doctest: +SKIP
    array([0, 0], dtype=int32)
    >>> clustering
    SpectralCoclustering(n_clusters=2, random_state=0)
    """

    # Shared constraints plus ``n_clusters``, which must be an integer >= 1.
    _parameter_constraints: dict = {
        **BaseSpectral._parameter_constraints,
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(
        self,
        n_clusters=3,
        *,
        svd_method="randomized",
        n_svd_vecs=None,
        mini_batch=False,
        init="k-means++",
        n_init=10,
        random_state=None,
    ):
        # All parameters are stored by the base class; they are forwarded
        # positionally in the order of ``BaseSpectral.__init__``.
        super().__init__(
            n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
        )

    def _check_parameters(self, n_samples):
        """Raise ``ValueError`` if ``n_clusters`` exceeds ``n_samples``."""
        if self.n_clusters > n_samples:
            raise ValueError(
                f"n_clusters should be <= n_samples={n_samples}. Got"
                f" {self.n_clusters} instead."
            )

    def _fit(self, X):
        """Compute the co-clustering of ``X`` and set the fitted attributes.

        Sets ``row_labels_``, ``column_labels_``, ``rows_`` and ``columns_``.
        """
        normalized_data, row_diag, col_diag = _scale_normalize(X)
        # Request 1 + ceil(log2(n_clusters)) singular vectors; the first one
        # is discarded by ``_svd`` (``n_discard=1``).
        n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
        u, v = self._svd(normalized_data, n_sv, n_discard=1)
        # Stack the rescaled row vectors on top of the rescaled column
        # vectors so that a single k-means run labels rows and columns.
        z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))
        _, labels = self._k_means(z, self.n_clusters)
        # The first ``n_rows`` labels belong to rows, the rest to columns.
        n_rows = X.shape[0]
        self.row_labels_ = labels[:n_rows]
        self.column_labels_ = labels[n_rows:]
        # One boolean indicator row per cluster.
        self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])
        self.columns_ = np.vstack(
            [self.column_labels_ == c for c in range(self.n_clusters)]
        )
class SpectralBiclustering(BaseSpectral):
    """Spectral biclustering (Kluger, 2003).

    Partitions rows and columns under the assumption that the data has
    an underlying checkerboard structure. For instance, if there are
    two row partitions and three column partitions, each row will
    belong to three biclusters, and each column will belong to two
    biclusters. The outer product of the corresponding row and column
    label vectors gives this checkerboard structure.

    Read more in the :ref:`User Guide <spectral_biclustering>`.

    Parameters
    ----------
    n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
        The number of row and column clusters in the checkerboard
        structure.

    method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
        Method of normalizing and converting singular vectors into
        biclusters. May be one of 'scale', 'bistochastic', or 'log'.
        The authors recommend using 'log'. If the data is sparse,
        however, log normalization will not work, which is why the
        default is 'bistochastic'.

        .. warning::
           if `method='log'`, the data must not be sparse.

    n_components : int, default=6
        Number of singular vectors to check.

    n_best : int, default=3
        Number of best singular vectors to which to project the data
        for clustering.

    svd_method : {'randomized', 'arpack'}, default='randomized'
        Selects the algorithm for finding singular vectors. May be
        'randomized' or 'arpack'. If 'randomized', uses
        :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
        for large matrices. If 'arpack', uses
        `scipy.sparse.linalg.svds`, which is more accurate, but
        possibly slower in some cases.

    n_svd_vecs : int, default=None
        Number of vectors to use in calculating the SVD. Corresponds
        to `ncv` when `svd_method='arpack'` and `n_oversamples` when
        `svd_method='randomized'`.

    mini_batch : bool, default=False
        Whether to use mini-batch k-means, which is faster but may get
        different results.

    init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
            default='k-means++'
        Method for initialization of k-means algorithm; defaults to
        'k-means++'.

    n_init : int, default=10
        Number of random initializations that are tried with the
        k-means algorithm.

        If mini-batch k-means is used, the best initialization is
        chosen and the algorithm runs once. Otherwise, the algorithm
        is run for each initialization and the best solution chosen.

    random_state : int, RandomState instance, default=None
        Used for randomizing the singular value decomposition and the k-means
        initialization. Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    rows_ : array-like of shape (n_row_clusters, n_rows)
        Results of the clustering. `rows[i, r]` is True if
        cluster `i` contains row `r`. Available only after calling ``fit``.

    columns_ : array-like of shape (n_column_clusters, n_columns)
        Results of the clustering, like `rows`.

    row_labels_ : array-like of shape (n_rows,)
        Row partition labels.

    column_labels_ : array-like of shape (n_cols,)
        Column partition labels.

    biclusters_ : tuple of two ndarrays
        The tuple contains the `rows_` and `columns_` arrays.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001).

    References
    ----------
    * :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray
      data: coclustering genes and conditions.
      <10.1101/gr.648603>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralBiclustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
    >>> clustering.row_labels_
    array([1, 1, 1, 0, 0, 0], dtype=int32)
    >>> clustering.column_labels_
    array([1, 0], dtype=int32)
    >>> clustering
    SpectralBiclustering(n_clusters=2, random_state=0)
    """

    # Shared constraints plus the parameters specific to this estimator.
    # ``n_clusters`` may be an integer >= 1 or a tuple; the tuple contents
    # are checked later in ``_check_parameters``.
    _parameter_constraints: dict = {
        **BaseSpectral._parameter_constraints,
        "n_clusters": [Interval(Integral, 1, None, closed="left"), tuple],
        "method": [StrOptions({"bistochastic", "scale", "log"})],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "n_best": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(
        self,
        n_clusters=3,
        *,
        method="bistochastic",
        n_components=6,
        n_best=3,
        svd_method="randomized",
        n_svd_vecs=None,
        mini_batch=False,
        init="k-means++",
        n_init=10,
        random_state=None,
    ):
        # Shared parameters are stored by the base class; they are forwarded
        # positionally in the order of ``BaseSpectral.__init__``.
        super().__init__(
            n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
        )
        self.method = method
        self.n_components = n_components
        self.n_best = n_best

    def _check_parameters(self, n_samples):
        """Validate ``n_clusters`` and ``n_best`` against the data size.

        Raises
        ------
        ValueError
            If ``n_clusters`` (or one of its two entries) is larger than
            ``n_samples`` or otherwise invalid, or if ``n_best`` is larger
            than ``n_components``.
        """
        if isinstance(self.n_clusters, Integral):
            if self.n_clusters > n_samples:
                raise ValueError(
                    f"n_clusters should be <= n_samples={n_samples}. Got"
                    f" {self.n_clusters} instead."
                )
        else:  # tuple
            try:
                # Unpacking fails for tuples that do not hold exactly two
                # items; that failure is converted below.
                n_row_clusters, n_column_clusters = self.n_clusters
                check_scalar(
                    n_row_clusters,
                    "n_row_clusters",
                    target_type=Integral,
                    min_val=1,
                    max_val=n_samples,
                )
                # NOTE(review): the column cluster count is also bounded by
                # ``n_samples`` rather than by the number of columns --
                # confirm this is intended.
                check_scalar(
                    n_column_clusters,
                    "n_column_clusters",
                    target_type=Integral,
                    min_val=1,
                    max_val=n_samples,
                )
            except (ValueError, TypeError) as e:
                # Re-raise as a single ValueError, chaining the original.
                raise ValueError(
                    "Incorrect parameter n_clusters has value:"
                    f" {self.n_clusters}. It should either be a single integer"
                    " or an iterable with two integers:"
                    " (n_row_clusters, n_column_clusters)"
                    " And the values are should be in the"
                    " range: (1, n_samples)"
                ) from e

        if self.n_best > self.n_components:
            raise ValueError(
                f"n_best={self.n_best} must be <= n_components={self.n_components}."
            )

    def _fit(self, X):
        """Compute the checkerboard biclustering of ``X``.

        Sets ``row_labels_``, ``column_labels_``, ``rows_`` and ``columns_``.
        """
        n_sv = self.n_components
        # For 'bistochastic' and 'scale' one extra singular vector is
        # requested because the first one is discarded below.
        if self.method == "bistochastic":
            normalized_data = _bistochastic_normalize(X)
            n_sv += 1
        elif self.method == "scale":
            normalized_data, _, _ = _scale_normalize(X)
            n_sv += 1
        elif self.method == "log":
            normalized_data = _log_normalize(X)
        n_discard = 0 if self.method == "log" else 1
        u, v = self._svd(normalized_data, n_sv, n_discard)
        ut = u.T
        vt = v.T

        # ``n_clusters`` is either a (rows, columns) pair or a single int
        # used for both; unpacking an int raises TypeError.
        try:
            n_row_clusters, n_col_clusters = self.n_clusters
        except TypeError:
            n_row_clusters = n_col_clusters = self.n_clusters

        best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)

        best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)

        # Rows are projected onto the selected right vectors and columns
        # onto the selected left vectors before clustering.
        self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)

        self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)

        # One indicator row per (row cluster, column cluster) pair: the row
        # indicator is repeated for every column cluster ...
        self.rows_ = np.vstack(
            [
                self.row_labels_ == label
                for label in range(n_row_clusters)
                for _ in range(n_col_clusters)
            ]
        )
        # ... and the column indicators cycle within each row cluster.
        self.columns_ = np.vstack(
            [
                self.column_labels_ == label
                for _ in range(n_row_clusters)
                for label in range(n_col_clusters)
            ]
        )

    def _fit_best_piecewise(self, vectors, n_best, n_clusters):
        """Find the ``n_best`` vectors that are best approximated by piecewise
        constant vectors.

        The piecewise vectors are found by k-means; the best is chosen
        according to Euclidean distance.

        """

        def make_piecewise(v):
            # Replace every entry of ``v`` by the centroid of its cluster.
            centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
            return centroid[labels].ravel()

        piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)
        dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors))
        # Keep the ``n_best`` vectors with the smallest approximation error.
        result = vectors[np.argsort(dists)[:n_best]]
        return result

    def _project_and_cluster(self, data, vectors, n_clusters):
        """Project ``data`` to ``vectors`` and cluster the result."""
        projected = safe_sparse_dot(data, vectors)
        _, labels = self._k_means(projected, n_clusters)
        return labels
@@ -0,0 +1,741 @@
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# Joel Nothman <joel.nothman@gmail.com>
# License: BSD 3 clause
import warnings
from math import sqrt
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from .._config import config_context
from ..base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
ClusterMixin,
TransformerMixin,
_fit_context,
)
from ..exceptions import ConvergenceWarning
from ..metrics import pairwise_distances_argmin
from ..metrics.pairwise import euclidean_distances
from ..utils._param_validation import Interval
from ..utils.extmath import row_norms
from ..utils.validation import check_is_fitted
from . import AgglomerativeClustering
def _iterate_sparse_X(X):
"""This little hack returns a densified row when iterating over a sparse
matrix, instead of constructing a sparse matrix for every row that is
expensive.
"""
n_samples = X.shape[0]
X_indices = X.indices
X_data = X.data
X_indptr = X.indptr
for i in range(n_samples):
row = np.zeros(X.shape[1])
startptr, endptr = X_indptr[i], X_indptr[i + 1]
nonzero_indices = X_indices[startptr:endptr]
row[nonzero_indices] = X_data[startptr:endptr]
yield row
def _split_node(node, threshold, branching_factor):
    """Split ``node`` in two when it has no room for a new subcluster.

    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.

    Returns the two new subclusters.
    """
    # Both replacement nodes share the configuration of the node being split.
    node_config = dict(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_subcluster1 = _CFSubcluster()
    new_subcluster2 = _CFSubcluster()
    new_node1 = _CFNode(**node_config)
    new_node2 = _CFNode(**node_config)
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    if node.is_leaf:
        # Put the two new leaves into the leaf chain where ``node`` was.
        previous_leaf = node.prev_leaf_
        if previous_leaf is not None:
            previous_leaf.next_leaf_ = new_node1
        new_node1.prev_leaf_ = previous_leaf
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
    )
    n_clusters = dist.shape[0]

    # The two centroids that are farthest apart act as seeds.
    seed1, seed2 = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
    node1_closer = dist[seed1] < dist[seed2]
    # make sure node1 is closest to itself even if all distances are equal.
    # This can only happen when all node.centroids_ are duplicates leading to all
    # distances between centroids being zero.
    node1_closer[seed1] = True

    for subcluster, goes_to_first in zip(node.subclusters_, node1_closer):
        if goes_to_first:
            target_node, target_subcluster = new_node1, new_subcluster1
        else:
            target_node, target_subcluster = new_node2, new_subcluster2
        target_node.append_subcluster(subcluster)
        target_subcluster.update(subcluster)
    return new_subcluster1, new_subcluster2
class _CFNode:
"""Each node in a CFTree is called a CFNode.
The CFNode can have a maximum of branching_factor
number of CFSubclusters.
Parameters
----------
threshold : float
Threshold needed for a new subcluster to enter a CFSubcluster.
branching_factor : int
Maximum number of CF subclusters in each node.
is_leaf : bool
We need to know if the CFNode is a leaf or not, in order to
retrieve the final subclusters.
n_features : int
The number of features.
Attributes
----------
subclusters_ : list
List of subclusters for a particular CFNode.
prev_leaf_ : _CFNode
Useful only if is_leaf is True.
next_leaf_ : _CFNode
next_leaf. Useful only if is_leaf is True.
the final subclusters.
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
Manipulate ``init_centroids_`` throughout rather than centroids_ since
the centroids are just a view of the ``init_centroids_`` .
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.
centroids_ : ndarray of shape (branching_factor + 1, n_features)
View of ``init_centroids_``.
squared_norm_ : ndarray of shape (branching_factor + 1,)
View of ``init_sq_norm_``.
"""
def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
self.threshold = threshold
self.branching_factor = branching_factor
self.is_leaf = is_leaf
self.n_features = n_features
# The list of subclusters, centroids and squared norms
# to manipulate throughout.
self.subclusters_ = []
self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
self.squared_norm_ = []
self.prev_leaf_ = None
self.next_leaf_ = None
def append_subcluster(self, subcluster):
n_samples = len(self.subclusters_)
self.subclusters_.append(subcluster)
self.init_centroids_[n_samples] = subcluster.centroid_
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
# Keep centroids and squared norm as views. In this way
# if we change init_centroids and init_sq_norm_, it is
# sufficient,
self.centroids_ = self.init_centroids_[: n_samples + 1, :]
self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]
def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
"""Remove a subcluster from a node and update it with the
split subclusters.
"""
ind = self.subclusters_.index(subcluster)
self.subclusters_[ind] = new_subcluster1
self.init_centroids_[ind] = new_subcluster1.centroid_
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
self.append_subcluster(new_subcluster2)
def insert_cf_subcluster(self, subcluster):
"""Insert a new subcluster into the node."""
if not self.subclusters_:
self.append_subcluster(subcluster)
return False
threshold = self.threshold
branching_factor = self.branching_factor
# We need to find the closest subcluster among all the
# subclusters so that we can insert our new subcluster.
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
dist_matrix *= -2.0
dist_matrix += self.squared_norm_
closest_index = np.argmin(dist_matrix)
closest_subcluster = self.subclusters_[closest_index]
# If the subcluster has a child, we need a recursive strategy.
if closest_subcluster.child_ is not None:
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
if not split_child:
# If it is determined that the child need not be split, we
# can just update the closest_subcluster
closest_subcluster.update(subcluster)
self.init_centroids_[closest_index] = self.subclusters_[
closest_index
].centroid_
self.init_sq_norm_[closest_index] = self.subclusters_[
closest_index
].sq_norm_
return False
# things not too good. we need to redistribute the subclusters in
# our child node, and add a new subcluster in the parent
# subcluster to accommodate the new child.
else:
new_subcluster1, new_subcluster2 = _split_node(
closest_subcluster.child_,
threshold,
branching_factor,
)
self.update_split_subclusters(
closest_subcluster, new_subcluster1, new_subcluster2
)
if len(self.subclusters_) > self.branching_factor:
return True
return False
# good to go!
else:
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
if merged:
self.init_centroids_[closest_index] = closest_subcluster.centroid_
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
return False
# not close to any other subclusters, and we still
# have space, so add.
elif len(self.subclusters_) < self.branching_factor:
self.append_subcluster(subcluster)
return False
# We do not have enough space nor is it closer to an
# other subcluster. We need to split.
else:
self.append_subcluster(subcluster)
return True
class _CFSubcluster:
"""Each subcluster in a CFNode is called a CFSubcluster.
A CFSubcluster can have a CFNode has its child.
Parameters
----------
linear_sum : ndarray of shape (n_features,), default=None
Sample. This is kept optional to allow initialization of empty
subclusters.
Attributes
----------
n_samples_ : int
Number of samples that belong to each subcluster.
linear_sum_ : ndarray
Linear sum of all the samples in a subcluster. Prevents holding
all sample data in memory.
squared_sum_ : float
Sum of the squared l2 norms of all samples belonging to a subcluster.
centroid_ : ndarray of shape (branching_factor + 1, n_features)
Centroid of the subcluster. Prevent recomputing of centroids when
``CFNode.centroids_`` is called.
child_ : _CFNode
Child Node of the subcluster. Once a given _CFNode is set as the child
of the _CFNode, it is set to ``self.child_``.
sq_norm_ : ndarray of shape (branching_factor + 1,)
Squared norm of the subcluster. Used to prevent recomputing when
pairwise minimum distances are computed.
"""
def __init__(self, *, linear_sum=None):
if linear_sum is None:
self.n_samples_ = 0
self.squared_sum_ = 0.0
self.centroid_ = self.linear_sum_ = 0
else:
self.n_samples_ = 1
self.centroid_ = self.linear_sum_ = linear_sum
self.squared_sum_ = self.sq_norm_ = np.dot(
self.linear_sum_, self.linear_sum_
)
self.child_ = None
def update(self, subcluster):
self.n_samples_ += subcluster.n_samples_
self.linear_sum_ += subcluster.linear_sum_
self.squared_sum_ += subcluster.squared_sum_
self.centroid_ = self.linear_sum_ / self.n_samples_
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
def merge_subcluster(self, nominee_cluster, threshold):
"""Check if a cluster is worthy enough to be merged. If
yes then merge.
"""
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
new_n = self.n_samples_ + nominee_cluster.n_samples_
new_centroid = (1 / new_n) * new_ls
new_sq_norm = np.dot(new_centroid, new_centroid)
# The squared radius of the cluster is defined:
# r^2 = sum_i ||x_i - c||^2 / n
# with x_i the n points assigned to the cluster and c its centroid:
# c = sum_i x_i / n
# This can be expanded to:
# r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
# and therefore simplifies to:
# r^2 = sum_i ||x_i||^2 / n - ||c||^2
sq_radius = new_ss / new_n - new_sq_norm
if sq_radius <= threshold**2:
(
self.n_samples_,
self.linear_sum_,
self.squared_sum_,
self.centroid_,
self.sq_norm_,
) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
return True
return False
@property
def radius(self):
    """Return radius of the subcluster."""
    mean_sq_norm = self.squared_sum_ / self.n_samples_
    sq_radius = mean_sq_norm - self.sq_norm_
    # Rounding errors can push the difference slightly below zero, so it is
    # clamped before taking the square root.
    clamped = sq_radius if sq_radius > 0 else 0
    return sqrt(clamped)
class Birch(
    ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
    """Implements the BIRCH clustering algorithm.

    It is a memory-efficient, online-learning algorithm provided as an
    alternative to :class:`MiniBatchKMeans`. It constructs a tree
    data structure with the cluster centroids being read off the leaf.
    These can be either the final cluster centroids or can be provided as input
    to another clustering algorithm such as :class:`AgglomerativeClustering`.

    Read more in the :ref:`User Guide <birch>`.

    .. versionadded:: 0.16

    Parameters
    ----------
    threshold : float, default=0.5
        The radius of the subcluster obtained by merging a new sample and the
        closest subcluster should be less than the threshold. Otherwise a new
        subcluster is started. Setting this value to be very low promotes
        splitting and vice-versa.

    branching_factor : int, default=50
        Maximum number of CF subclusters in each node. If a new sample enters
        such that the number of subclusters exceeds the branching_factor then
        that node is split into two nodes with the subclusters redistributed
        in each. The parent subcluster of that node is removed and two new
        subclusters are added as parents of the 2 split nodes.

    n_clusters : int, instance of sklearn.cluster model or None, default=3
        Number of clusters after the final clustering step, which treats the
        subclusters from the leaves as new samples.

        - `None` : the final clustering step is not performed and the
          subclusters are returned as they are.

        - :mod:`sklearn.cluster` Estimator : If a model is provided, the model
          is fit treating the subclusters as new samples and the initial data
          is mapped to the label of the closest subcluster.

        - `int` : the model fit is :class:`AgglomerativeClustering` with
          `n_clusters` set to be equal to the int.

    compute_labels : bool, default=True
        Whether or not to compute labels for each fit.

    copy : bool, default=True
        Whether or not to make a copy of the given data. If set to False,
        the initial data will be overwritten.

    Attributes
    ----------
    root_ : _CFNode
        Root of the CFTree.

    dummy_leaf_ : _CFNode
        Start pointer to all the leaves.

    subcluster_centers_ : ndarray
        Centroids of all subclusters read directly from the leaves.

    subcluster_labels_ : ndarray
        Labels assigned to the centroids of the subclusters after
        they are clustered globally.

    labels_ : ndarray of shape (n_samples,)
        Array of labels assigned to the input data.
        If partial_fit is used instead of fit, they are assigned to the
        last batch of data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MiniBatchKMeans : Alternative implementation that does incremental updates
        of the centers' positions using mini-batches.

    Notes
    -----
    The tree data structure consists of nodes with each node consisting of
    a number of subclusters. The maximum number of subclusters in a node
    is determined by the branching factor. Each subcluster maintains a
    linear sum, squared sum and the number of samples in that subcluster.
    In addition, each subcluster can also have a node as its child, if the
    subcluster is not a member of a leaf node.

    For a new point entering the root, it is merged with the subcluster closest
    to it and the linear sum, squared sum and the number of samples of that
    subcluster are updated. This is done recursively till the properties of
    the leaf node are updated.

    References
    ----------
    * Tian Zhang, Raghu Ramakrishnan, Maron Livny
      BIRCH: An efficient data clustering method for large databases.
      https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf

    * Roberto Perdisci
      JBirch - Java implementation of BIRCH clustering algorithm
      https://code.google.com/archive/p/jbirch

    Examples
    --------
    >>> from sklearn.cluster import Birch
    >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
    >>> brc = Birch(n_clusters=None)
    >>> brc.fit(X)
    Birch(n_clusters=None)
    >>> brc.predict(X)
    array([0, 0, 0, 1, 1, 1])
    """

    _parameter_constraints: dict = {
        "threshold": [Interval(Real, 0.0, None, closed="neither")],
        "branching_factor": [Interval(Integral, 1, None, closed="neither")],
        "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
        "compute_labels": ["boolean"],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        threshold=0.5,
        branching_factor=50,
        n_clusters=3,
        compute_labels=True,
        copy=True,
    ):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.compute_labels = compute_labels
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Build a CF Tree for the input data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        return self._fit(X, partial=False)

    def _fit(self, X, partial):
        """Insert the rows of X into the CF tree, then cluster the leaves.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        partial : bool
            True when called from `partial_fit`. An existing tree is then
            extended instead of being rebuilt.

        Returns
        -------
        self
            Fitted estimator.
        """
        has_root = getattr(self, "root_", None)
        first_call = not (partial and has_root)

        X = self._validate_data(
            X,
            accept_sparse="csr",
            copy=self.copy,
            reset=first_call,
            dtype=[np.float64, np.float32],
        )
        threshold = self.threshold
        branching_factor = self.branching_factor
        n_features = X.shape[1]

        # Arguments shared by every node created in this call.
        node_params = {
            "threshold": threshold,
            "branching_factor": branching_factor,
            "n_features": n_features,
            "dtype": X.dtype,
        }

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _CFNode(is_leaf=True, **node_params)

            # To enable getting back subclusters.
            self.dummy_leaf_ = _CFNode(is_leaf=True, **node_params)
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        iter_func = _iterate_sparse_X if sparse.issparse(X) else iter

        for sample in iter_func(X):
            subcluster = _CFSubcluster(linear_sum=sample)
            if not self.root_.insert_cf_subcluster(subcluster):
                continue

            # The root reported that it must be split: its two halves become
            # the only subclusters of a fresh, non-leaf root.
            new_subcluster1, new_subcluster2 = _split_node(
                self.root_, threshold, branching_factor
            )
            new_root = _CFNode(is_leaf=False, **node_params)
            new_root.append_subcluster(new_subcluster1)
            new_root.append_subcluster(new_subcluster2)
            self.root_ = new_root

        centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
        self.subcluster_centers_ = centroids
        self._n_features_out = self.subcluster_centers_.shape[0]

        self._global_clustering(X)
        return self

    def _get_leaves(self):
        """
        Retrieve the leaves of the CF Node.

        Returns
        -------
        leaves : list of shape (n_leaves,)
            List of the leaf nodes.
        """
        leaves = []
        # Leaves are chained through `next_leaf_`, starting after the dummy.
        leaf_ptr = self.dummy_leaf_.next_leaf_
        while leaf_ptr is not None:
            leaves.append(leaf_ptr)
            leaf_ptr = leaf_ptr.next_leaf_
        return leaves

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X=None, y=None):
        """
        Online learning. Prevents rebuilding of CFTree from scratch.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
            default=None
            Input data. If X is not provided, only the global clustering
            step is done.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        if X is not None:
            return self._fit(X, partial=True)

        # Only the final global clustering step is requested. It reads
        # `subcluster_centers_`, so fail with the usual "not fitted" error
        # rather than an opaque AttributeError when no tree exists yet.
        check_is_fitted(self)
        self._global_clustering()
        return self

    def _check_fit(self, X):
        """Check that the estimator is fitted and that X has as many features
        as the stored subcluster centers.

        Raises
        ------
        ValueError
            If the number of columns of X differs from that of
            `subcluster_centers_`.
        """
        check_is_fitted(self)

        if (
            hasattr(self, "subcluster_centers_")
            and X.shape[1] != self.subcluster_centers_.shape[1]
        ):
            raise ValueError(
                "Training data and predicted data do not have same number of features."
            )

    def predict(self, X):
        """
        Predict data using the ``centroids_`` of subclusters.

        Avoid computation of the row norms of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        labels : ndarray of shape(n_samples,)
            Labelled data.
        """
        check_is_fitted(self)
        X = self._validate_data(X, accept_sparse="csr", reset=False)
        return self._predict(X)

    def _predict(self, X):
        """Predict data using the ``centroids_`` of subclusters."""
        # Reuse the squared norms of the centers cached by _global_clustering.
        kwargs = {"Y_norm_squared": self._subcluster_norms}

        with config_context(assume_finite=True):
            argmin = pairwise_distances_argmin(
                X, self.subcluster_centers_, metric_kwargs=kwargs
            )
        return self.subcluster_labels_[argmin]

    def transform(self, X):
        """
        Transform X into subcluster centroids dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
            Transformed data.
        """
        check_is_fitted(self)
        X = self._validate_data(X, accept_sparse="csr", reset=False)
        with config_context(assume_finite=True):
            return euclidean_distances(X, self.subcluster_centers_)

    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting.

        Sets `subcluster_labels_` and, when X is given and `compute_labels`
        is True, `labels_`.
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, Integral):
            clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
            # There is no need to perform the global clustering step when
            # fewer subclusters than requested clusters were found.
            not_enough_centroids = len(centroids) < self.n_clusters

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            # Every subcluster is its own final cluster.
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by BIRCH is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters),
                    ConvergenceWarning,
                )
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self._predict(X)

    def _more_tags(self):
        return {"preserves_dtype": [np.float64, np.float32]}
@@ -0,0 +1,530 @@
"""Bisecting K-means clustering."""
# Author: Michal Krawczyk <mkrwczyk.1@gmail.com>
import warnings
import numpy as np
import scipy.sparse as sp
from ..base import _fit_context
from ..utils._openmp_helpers import _openmp_effective_n_threads
from ..utils._param_validation import Integral, Interval, StrOptions
from ..utils.extmath import row_norms
from ..utils.validation import _check_sample_weight, check_is_fitted, check_random_state
from ._k_means_common import _inertia_dense, _inertia_sparse
from ._kmeans import (
_BaseKMeans,
_kmeans_single_elkan,
_kmeans_single_lloyd,
_labels_inertia_threadpool_limit,
)
class _BisectingTree:
    """Tree structure representing the hierarchical clusters of BisectingKMeans."""

    def __init__(self, center, indices, score):
        """Create a new cluster node in the tree.

        The node stores the cluster center, the indices of the data points
        assigned to the cluster, and the score used to decide which cluster
        to bisect next. It starts without children.
        """
        self.center = center
        self.indices = indices
        self.score = score

        self.left = None
        self.right = None

    def split(self, labels, centers, scores):
        """Split the cluster node into two subclusters.

        Points labelled 0 go to the left child and points labelled 1 to the
        right child; child ``i`` receives ``centers[i]`` and ``scores[i]``.
        """
        in_left = labels == 0
        self.left = _BisectingTree(
            center=centers[0], indices=self.indices[in_left], score=scores[0]
        )
        in_right = labels == 1
        self.right = _BisectingTree(
            center=centers[1], indices=self.indices[in_right], score=scores[1]
        )

        # The children now own the indices; drop ours to save memory.
        self.indices = None

    def get_cluster_to_bisect(self):
        """Return the cluster node to bisect next.

        The leaf with the highest score is returned; on ties the first such
        leaf in traversal order wins. The score is either the number of data
        points assigned to the cluster or the inertia of the cluster
        (see `bisecting_strategy` for details).
        """
        return max(self.iter_leaves(), key=lambda leaf: leaf.score)

    def iter_leaves(self):
        """Iterate over all the cluster leaves in the tree."""
        # Depth-first traversal with an explicit stack. The right child is
        # pushed first so that the left subtree is visited first.
        pending = [self]
        while pending:
            node = pending.pop()
            if node.left is None:
                yield node
            else:
                pending.append(node.right)
                pending.append(node.left)
class BisectingKMeans(_BaseKMeans):
    """Bisecting K-Means clustering.

    Read more in the :ref:`User Guide <bisect_k_means>`.

    .. versionadded:: 1.1

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form as well as the number of
        centroids to generate.

    init : {'k-means++', 'random'} or callable, default='random'
        Method for initialization:

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose `n_clusters` observations (rows) at random from data
        for the initial centroids.

        If a callable is passed, it should take arguments X, n_clusters and a
        random state and return an initialization.

    n_init : int, default=1
        Number of times the inner k-means algorithm will be run with different
        centroid seeds in each bisection.
        For each bisection, the best output of the n_init consecutive runs in
        terms of inertia is kept.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for centroid initialization
        in inner K-Means. Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    max_iter : int, default=300
        Maximum number of iterations of the inner k-means algorithm at each
        bisection.

    verbose : int, default=0
        Verbosity mode.

    tol : float, default=1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence. Used in inner k-means algorithm at each bisection to pick
        best possible clusters.

    copy_x : bool, default=True
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True (default), then the original data is
        not modified. If False, the original data is modified, and put back
        before the function returns, but small numerical differences may be
        introduced by subtracting and then adding the data mean. Note that if
        the original data is not C-contiguous, a copy will be made even if
        copy_x is False. If the original data is sparse, but not in CSR format,
        a copy will be made even if copy_x is False.

    algorithm : {"lloyd", "elkan"}, default="lloyd"
        Inner K-means algorithm used in bisection.
        The classical EM-style algorithm is `"lloyd"`.
        The `"elkan"` variation can be more efficient on some datasets with
        well-defined clusters, by using the triangle inequality. However it's
        more memory intensive due to the allocation of an extra array of shape
        `(n_samples, n_clusters)`.

    bisecting_strategy : {"biggest_inertia", "largest_cluster"},\
            default="biggest_inertia"
        Defines how bisection should be performed:

         - "biggest_inertia" means that BisectingKMeans will always check
            all calculated cluster for cluster with biggest SSE
            (Sum of squared errors) and bisect it. This approach concentrates on
            precision, but may be costly in terms of execution time (especially for
            larger amount of data points).

         - "largest_cluster" - BisectingKMeans will always split cluster with
            largest amount of points assigned to it from all clusters
            previously calculated. That should work faster than picking by SSE
            ('biggest_inertia') and may produce similar results in most cases.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers. If the algorithm stops before fully
        converging (see ``tol`` and ``max_iter``), these will not be
        consistent with ``labels_``.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    inertia_ : float
        Sum of squared distances of samples to their closest cluster center,
        weighted by the sample weights if provided.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    See Also
    --------
    KMeans : Original implementation of K-Means algorithm.

    Notes
    -----
    It might be inefficient when n_cluster is less than 3, due to unnecessary
    calculations for that case.

    Examples
    --------
    >>> from sklearn.cluster import BisectingKMeans
    >>> import numpy as np
    >>> X = np.array([[1, 1], [10, 1], [3, 1],
    ...               [10, 0], [2, 1], [10, 2],
    ...               [10, 8], [10, 9], [10, 10]])
    >>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
    >>> bisect_means.labels_
    array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32)
    >>> bisect_means.predict([[0, 0], [12, 3]])
    array([0, 2], dtype=int32)
    >>> bisect_means.cluster_centers_
    array([[ 2.,  1.],
           [10.,  9.],
           [10.,  1.]])
    """

    _parameter_constraints: dict = {
        **_BaseKMeans._parameter_constraints,
        "init": [StrOptions({"k-means++", "random"}), callable],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "copy_x": ["boolean"],
        "algorithm": [StrOptions({"lloyd", "elkan"})],
        "bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})],
    }

    def __init__(
        self,
        n_clusters=8,
        *,
        init="random",
        n_init=1,
        random_state=None,
        max_iter=300,
        verbose=0,
        tol=1e-4,
        copy_x=True,
        algorithm="lloyd",
        bisecting_strategy="biggest_inertia",
    ):
        super().__init__(
            n_clusters=n_clusters,
            init=init,
            max_iter=max_iter,
            verbose=verbose,
            random_state=random_state,
            tol=tol,
            n_init=n_init,
        )

        self.copy_x = copy_x
        self.algorithm = algorithm
        self.bisecting_strategy = bisecting_strategy

    def _warn_mkl_vcomp(self, n_active_threads):
        """Warn when vcomp and mkl are both present"""
        warnings.warn(
            "BisectingKMeans is known to have a memory leak on Windows "
            "with MKL, when there are less chunks than available "
            "threads. You can avoid it by setting the environment"
            f" variable OMP_NUM_THREADS={n_active_threads}."
        )

    def _inertia_per_cluster(self, X, centers, labels, sample_weight):
        """Calculate the sum of squared errors (inertia) per cluster.

        Parameters
        ----------
        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
            The input samples.

        centers : ndarray of shape (n_clusters=2, n_features)
            The cluster centers.

        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.

        sample_weight : ndarray of shape (n_samples,)
            The weights for each observation in X.

        Returns
        -------
        inertia_per_cluster : ndarray of shape (n_clusters=2,)
            Sum of squared errors (inertia) for each cluster.
        """
        n_clusters = centers.shape[0]  # = 2 since centers comes from a bisection
        _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense

        inertia_per_cluster = np.empty(n_clusters)
        for label in range(n_clusters):
            # `single_label` restricts the computation to one cluster.
            inertia_per_cluster[label] = _inertia(
                X, sample_weight, centers, labels, self._n_threads, single_label=label
            )

        return inertia_per_cluster

    def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
        """Split a cluster into 2 subclusters.

        Parameters
        ----------
        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
            Training instances to cluster.

        x_squared_norms : ndarray of shape (n_samples,)
            Squared euclidean norm of each data point.

        sample_weight : ndarray of shape (n_samples,)
            The weights for each observation in X.

        cluster_to_bisect : _BisectingTree node object
            The cluster node to split.
        """
        # Restrict every per-sample array to the members of the cluster.
        member_indices = cluster_to_bisect.indices
        X = X[member_indices]
        x_squared_norms = x_squared_norms[member_indices]
        sample_weight = sample_weight[member_indices]

        best_inertia = None

        # Split samples in X into 2 clusters.
        # Repeating `n_init` times to obtain best clusters
        for _ in range(self.n_init):
            centers_init = self._init_centroids(
                X,
                x_squared_norms=x_squared_norms,
                init=self.init,
                random_state=self._random_state,
                n_centroids=2,
                sample_weight=sample_weight,
            )

            labels, inertia, centers, _ = self._kmeans_single(
                X,
                sample_weight,
                centers_init,
                max_iter=self.max_iter,
                verbose=self.verbose,
                tol=self.tol,
                n_threads=self._n_threads,
            )

            # allow small tolerance on the inertia to accommodate for
            # non-deterministic rounding errors due to parallel computation
            if best_inertia is None or inertia < best_inertia * (1 - 1e-6):
                best_labels = labels
                best_centers = centers
                best_inertia = inertia

        if self.verbose:
            print(f"New centroids from bisection: {best_centers}")

        if self.bisecting_strategy == "biggest_inertia":
            scores = self._inertia_per_cluster(
                X, best_centers, best_labels, sample_weight
            )
        else:  # bisecting_strategy == "largest_cluster"
            # Using minlength to make sure that we have the counts for both labels even
            # if all samples are labelled 0.
            scores = np.bincount(best_labels, minlength=2)

        cluster_to_bisect.split(best_labels, best_centers, scores)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """Compute bisecting k-means clustering.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)

            Training instances to cluster.

            .. note:: The data will be converted to C ordering,
                which will cause a memory copy
                if the given data is not C-contiguous.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            The weights for each observation in X. If None, all observations
            are assigned equal weight. `sample_weight` is not used during
            initialization if `init` is a callable.

        Returns
        -------
        self
            Fitted estimator.
        """
        X = self._validate_data(
            X,
            accept_sparse="csr",
            dtype=[np.float64, np.float32],
            order="C",
            copy=self.copy_x,
            accept_large_sparse=False,
        )

        self._check_params_vs_input(X)

        self._random_state = check_random_state(self.random_state)
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
        self._n_threads = _openmp_effective_n_threads()

        if self.algorithm == "lloyd" or self.n_clusters == 1:
            self._kmeans_single = _kmeans_single_lloyd
            self._check_mkl_vcomp(X, X.shape[0])
        else:
            self._kmeans_single = _kmeans_single_elkan

        X_is_sparse = sp.issparse(X)

        if X_is_sparse:
            # Sparse data is not centered. Forget a mean left behind by an
            # earlier fit on dense data: `_predict_recursive` adds `_X_mean`
            # to the node centers whenever the attribute exists, which would
            # shift the centers of this fit by an unrelated vector.
            if hasattr(self, "_X_mean"):
                del self._X_mean
        else:
            # Subtract the mean of X for more accurate distance computations
            self._X_mean = X.mean(axis=0)
            X -= self._X_mean

        # Initialize the hierarchical clusters tree
        self._bisecting_tree = _BisectingTree(
            indices=np.arange(X.shape[0]),
            center=X.mean(axis=0),
            score=0,
        )

        x_squared_norms = row_norms(X, squared=True)

        for _ in range(self.n_clusters - 1):
            # Choose cluster to bisect
            cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()

            # Split this cluster into 2 subclusters
            self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)

        # Aggregate final labels and centers from the bisecting tree
        self.labels_ = np.full(X.shape[0], -1, dtype=np.int32)
        self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)

        for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()):
            self.labels_[cluster_node.indices] = i
            self.cluster_centers_[i] = cluster_node.center
            cluster_node.label = i  # label final clusters for future prediction
            cluster_node.indices = None  # release memory

        # Restore original data
        if not X_is_sparse:
            X += self._X_mean
            self.cluster_centers_ += self._X_mean

        _inertia = _inertia_sparse if X_is_sparse else _inertia_dense
        self.inertia_ = _inertia(
            X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads
        )

        self._n_features_out = self.cluster_centers_.shape[0]

        return self

    def predict(self, X):
        """Predict which cluster each sample in X belongs to.

        Prediction is made by going down the hierarchical tree
        in searching of closest leaf cluster.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self)

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)

        # sample weights are unused but necessary in cython helpers
        sample_weight = np.ones_like(x_squared_norms)

        labels = self._predict_recursive(X, sample_weight, self._bisecting_tree)

        return labels

    def _predict_recursive(self, X, sample_weight, cluster_node):
        """Predict recursively by going down the hierarchical tree.

        Parameters
        ----------
        X : {ndarray, csr_matrix} of shape (n_samples, n_features)
            The data points, currently assigned to `cluster_node`, to predict between
            the subclusters of this node.

        sample_weight : ndarray of shape (n_samples,)
            The weights for each observation in X.

        cluster_node : _BisectingTree node object
            The cluster node of the hierarchical tree.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        if cluster_node.left is None:
            # This cluster has no subcluster. Labels are just the label of the cluster.
            return np.full(X.shape[0], cluster_node.label, dtype=np.int32)

        # Determine if data points belong to the left or right subcluster
        centers = np.vstack((cluster_node.left.center, cluster_node.right.center))
        if hasattr(self, "_X_mean"):
            # Node centers were computed on centered data; shift them back.
            centers += self._X_mean

        cluster_labels = _labels_inertia_threadpool_limit(
            X,
            sample_weight,
            centers,
            self._n_threads,
            return_inertia=False,
        )
        mask = cluster_labels == 0

        # Compute the labels for each subset of the data points.
        labels = np.full(X.shape[0], -1, dtype=np.int32)

        labels[mask] = self._predict_recursive(
            X[mask], sample_weight[mask], cluster_node.left
        )

        labels[~mask] = self._predict_recursive(
            X[~mask], sample_weight[~mask], cluster_node.right
        )

        return labels

    def _more_tags(self):
        return {"preserves_dtype": [np.float64, np.float32]}
@@ -0,0 +1,478 @@
"""
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
"""
# Author: Robert Layton <robertlayton@gmail.com>
# Joel Nothman <joel.nothman@gmail.com>
# Lars Buitinck
#
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from ..base import BaseEstimator, ClusterMixin, _fit_context
from ..metrics.pairwise import _VALID_METRICS
from ..neighbors import NearestNeighbors
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.validation import _check_sample_weight
from ._dbscan_inner import dbscan_inner
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=False,
)
def dbscan(
    X,
    eps=0.5,
    *,
    min_samples=5,
    metric="minkowski",
    metric_params=None,
    algorithm="auto",
    leaf_size=30,
    p=2,
    sample_weight=None,
    n_jobs=None,
):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : str or callable, default='minkowski'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit.
        X may be a :term:`sparse graph <sparse graph>`,
        in which case only "nonzero" elements may be considered neighbors.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, default=2
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array-like of shape (n_samples,), default=None
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search. ``None`` means
        1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
        using all processors. See :term:`Glossary <n_jobs>` for more details.
        If precomputed distances are used, parallel execution is not available
        and thus n_jobs will have no effect.

    Returns
    -------
    core_samples : ndarray of shape (n_core_samples,)
        Indices of core samples.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point.  Noisy samples are given the label -1.

    See Also
    --------
    DBSCAN : An estimator interface for this clustering algorithm.
    OPTICS : A similar estimator interface clustering at multiple values of
        eps. Our implementation is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n). It may attract a higher
    memory complexity when querying these nearest neighborhoods, depending
    on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
    memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
    <10.1145/3068335>`
    ACM Transactions on Database Systems (TODS), 42(3), 19.

    Examples
    --------
    >>> from sklearn.cluster import dbscan
    >>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]
    >>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
    >>> core_samples
    array([0, 1, 2, 3, 4])
    >>> labels
    array([ 0,  0,  0,  1,  1, -1])
    """
    # Every option except `sample_weight` configures the estimator itself;
    # the weights are a fit-time argument.
    estimator_params = {
        "eps": eps,
        "min_samples": min_samples,
        "metric": metric,
        "metric_params": metric_params,
        "algorithm": algorithm,
        "leaf_size": leaf_size,
        "p": p,
        "n_jobs": n_jobs,
    }
    estimator = DBSCAN(**estimator_params)
    estimator.fit(X, sample_weight=sample_weight)

    core_samples = estimator.core_sample_indices_
    labels = estimator.labels_
    return core_samples, labels
class DBSCAN(ClusterMixin, BaseEstimator):
    """Perform DBSCAN clustering from vector array or distance matrix.

    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
    Finds core samples of high density and expands clusters from them.
    Good for data which contains clusters of similar density.

    The worst case memory complexity of this implementation is
    :math:`O({n}^2)`, which can occur when the `eps` param is large and
    `min_samples` is low, whereas the original DBSCAN only uses linear memory.
    For further details, see the Notes below.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point to
        be considered as a core point. This includes the point itself. If
        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
        whereas if it is set to a lower value, the found clusters will be more
        sparse.

    metric : str, or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors for DBSCAN.

        .. versionadded:: 0.17
           metric *precomputed* to accept precomputed sparse matrix.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, default=None
        The power of the Minkowski metric to be used to calculate distance
        between points. If None, then ``p=2`` (equivalent to the Euclidean
        distance).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    core_sample_indices_ : ndarray of shape (n_core_samples,)
        Indices of core samples.

    components_ : ndarray of shape (n_core_samples, n_features)
        Copy of each core sample found by training.

    labels_ : ndarray of shape (n_samples,)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    OPTICS : A similar clustering at multiple values of eps. Our implementation
        is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n). It may attract a higher
    memory complexity when querying these nearest neighborhoods, depending
    on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory
    usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
    <10.1145/3068335>`
    ACM Transactions on Database Systems (TODS), 42(3), 19.

    Examples
    --------
    >>> from sklearn.cluster import DBSCAN
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 2], [2, 3],
    ...               [8, 7], [8, 8], [25, 80]])
    >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    >>> clustering.labels_
    array([ 0,  0,  0,  1,  1, -1])
    >>> clustering
    DBSCAN(eps=3, min_samples=2)
    """

    _parameter_constraints: dict = {
        "eps": [Interval(Real, 0.0, None, closed="neither")],
        "min_samples": [Interval(Integral, 1, None, closed="left")],
        "metric": [
            StrOptions(set(_VALID_METRICS) | {"precomputed"}),
            callable,
        ],
        "metric_params": [dict, None],
        "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
        "leaf_size": [Interval(Integral, 1, None, closed="left")],
        "p": [Interval(Real, 0.0, None, closed="left"), None],
        "n_jobs": [Integral, None],
    }

    def __init__(
        self,
        eps=0.5,
        *,
        min_samples=5,
        metric="euclidean",
        metric_params=None,
        algorithm="auto",
        leaf_size=30,
        p=None,
        n_jobs=None,
    ):
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self.metric_params = metric_params
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.n_jobs = n_jobs

    @_fit_context(
        # DBSCAN.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features, or distance matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        X = self._validate_data(X, accept_sparse="csr")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # The neighborhoods computed below keep each point inside its own
        # neighborhood (point i is a neighbor of point i). That is true but
        # carries no information, and is accounted for when counting neighbors.
        sparse_precomputed = self.metric == "precomputed" and sparse.issparse(X)
        if sparse_precomputed:
            # Work on a copy so the caller's matrix is left untouched, then
            # store the diagonal explicitly: every point is its own neighbor.
            X = X.copy()
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())

        radius_model = NearestNeighbors(
            radius=self.eps,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
            p=self.p,
            n_jobs=self.n_jobs,
        )
        radius_model.fit(X)
        # This query has worst case O(n^2) memory complexity.
        neighborhoods = radius_model.radius_neighbors(X, return_distance=False)

        # Neighborhood size: a plain count, or the summed weight when sample
        # weights are given.
        if sample_weight is None:
            n_neighbors = np.array([len(members) for members in neighborhoods])
        else:
            n_neighbors = np.array(
                [np.sum(sample_weight[members]) for members in neighborhoods]
            )

        # Every sample starts out as noise (-1).
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # Boolean mask (stored as uint8) flagging the core samples.
        core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)

        core_indices = np.where(core_samples)[0]
        self.core_sample_indices_ = core_indices
        self.labels_ = labels

        if not len(core_indices):
            # no core samples
            self.components_ = np.empty((0, X.shape[1]))
        else:
            # fix for scipy sparse indexing issue
            self.components_ = X[core_indices].copy()
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
        """Compute clusters from a data or distance matrix and predict labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels. Noisy samples are given the label -1.
        """
        self.fit(X, sample_weight=sample_weight)
        return self.labels_

    def _more_tags(self):
        return {"pairwise": self.metric == "precomputed"}
@@ -0,0 +1,40 @@
# Fast inner loop for DBSCAN.
# Author: Lars Buitinck
# License: 3-clause BSD
from libcpp.vector cimport vector
from ..utils._typedefs cimport uint8_t, intp_t
def dbscan_inner(const uint8_t[::1] is_core,
                 object[:] neighborhoods,
                 intp_t[::1] labels):
    """Assign cluster numbers in place by expanding from core samples.

    Parameters
    ----------
    is_core : contiguous uint8 buffer, one entry per sample
        Non-zero where the sample is a core sample.
    neighborhoods : buffer of objects
        ``neighborhoods[i]`` is read as an ``intp_t`` buffer holding the
        indices of the neighbors of sample ``i``.
    labels : contiguous intp buffer, one entry per sample
        Entries equal to -1 are treated as unlabeled. Modified in place:
        reached samples receive consecutive cluster numbers starting at 0.
    """
    cdef intp_t seed, node, k, neighbor, cluster_id = 0
    cdef intp_t[:] neighbor_ids
    cdef vector[intp_t] pending

    for seed in range(labels.shape[0]):
        # Only a still-unlabeled core sample can start a new cluster.
        if labels[seed] != -1 or not is_core[seed]:
            continue

        # Depth-first search starting from the seed, ending at the non-core
        # points. This mirrors the classic connected-components traversal,
        # except that non-core points are labeled as part of the cluster
        # without having their own neighborhoods expanded.
        node = seed
        while True:
            if labels[node] == -1:
                labels[node] = cluster_id
                if is_core[node]:
                    neighbor_ids = neighborhoods[node]
                    for k in range(neighbor_ids.shape[0]):
                        neighbor = neighbor_ids[k]
                        if labels[neighbor] == -1:
                            pending.push_back(neighbor)

            if pending.size() == 0:
                break
            node = pending.back()
            pending.pop_back()

        cluster_id += 1
@@ -0,0 +1,92 @@
"""
Feature agglomeration. Base classes and functions for performing feature
agglomeration.
"""
# Author: V. Michel, A. Gramfort
# License: BSD 3 clause
import numpy as np
from scipy.sparse import issparse
from ..base import TransformerMixin
from ..utils import metadata_routing
from ..utils.deprecation import _deprecate_Xt_in_inverse_transform
from ..utils.validation import check_is_fitted
###############################################################################
# Mixin class for feature agglomeration.
class AgglomerationTransform(TransformerMixin):
    """
    Mixin providing feature agglomeration through the transform interface.

    It relies on the host estimator exposing ``labels_`` (the cluster index of
    each feature) and ``pooling_func`` (the reduction applied per cluster).
    """

    # Declare the non-standard ``Xt`` argument of ``inverse_transform`` as
    # unused for metadata routing, so that no request setter is generated
    # for it.
    # TODO(1.7): remove when Xt is removed for inverse_transform.
    __metadata_request__inverse_transform = {"Xt": metadata_routing.UNUSED}

    def transform(self, X):
        """
        Transform a new matrix using the built clustering.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            A M by N array of M observations in N dimensions or a length
            M array of M one-dimensional observations.

        Returns
        -------
        Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)
            The pooled values for each feature cluster.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False)
        dense_mean = self.pooling_func == np.mean and not issparse(X)

        if not dense_mean:
            # Generic path: apply the pooling function to the columns of each
            # cluster in turn, then put samples back on the first axis.
            pooled = [
                self.pooling_func(X[:, self.labels_ == label], axis=1)
                for label in np.unique(self.labels_)
            ]
            return np.array(pooled).T

        # Fast path for the mean of dense data: per-cluster weighted bincount
        # of each row, divided by the number of features in each cluster.
        cluster_sizes = np.bincount(self.labels_)
        return np.array(
            [
                np.bincount(self.labels_, X[row, :]) / cluster_sizes
                for row in range(X.shape[0])
            ]
        )

    def inverse_transform(self, X=None, *, Xt=None):
        """
        Inverse the transformation and return a vector of size `n_features`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_clusters) or (n_clusters,)
            The values to be assigned to each cluster of samples.

        Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,)
            The values to be assigned to each cluster of samples.

            .. deprecated:: 1.5
                `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead.

        Returns
        -------
        X : ndarray of shape (n_samples, n_features) or (n_features,)
            Array whose last axis has one entry per element of `labels_`, each
            entry holding the value of `X` for the cluster that element
            belongs to.
        """
        X = _deprecate_Xt_in_inverse_transform(X, Xt)

        check_is_fitted(self)

        # `inverse` maps every entry of `labels_` to the position of its
        # cluster among the sorted unique labels.
        _, inverse = np.unique(self.labels_, return_inverse=True)
        return X[..., inverse]
@@ -0,0 +1,272 @@
# Minimum spanning tree single linkage implementation for hdbscan
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
# Steve Astels <sastels@gmail.com>
# Meekail Zain <zainmeekail@gmail.com>
# Copyright (c) 2015, Leland McInnes
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
cimport numpy as cnp
from libc.float cimport DBL_MAX
import numpy as np
from ...metrics._dist_metrics cimport DistanceMetric64
from ...cluster._hierarchical_fast cimport UnionFind
from ...cluster._hdbscan._tree cimport HIERARCHY_t
from ...cluster._hdbscan._tree import HIERARCHY_dtype
from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t
cnp.import_array()
cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
# Numpy structured dtype representing a single ordered edge in Prim's algorithm.
# Arrays allocated with this dtype are accessed in this file through buffers
# typed as `MST_edge_t`, so the field names, order and widths of the two
# declarations are kept the same.
MST_edge_dtype = np.dtype([
    ("current_node", np.int64),
    ("next_node", np.int64),
    ("distance", np.float64),
])

# Packed shouldn't make a difference since they're all 8-byte quantities,
# but it's included just to be safe.
ctypedef packed struct MST_edge_t:
    # Node the edge starts from.
    int64_t current_node
    # Node the edge leads to.
    int64_t next_node
    # Distance value stored for the edge.
    float64_t distance
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
    cnp.ndarray[float64_t, ndim=2] mutual_reachability
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph using Prim's algorithm.

    Parameters
    ----------
    mutual_reachability : ndarray of shape (n_samples, n_samples)
        Array of mutual-reachabilities between samples.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """
    cdef:
        # Note: ndarrays are used rather than memory-views so that numpy
        # boolean masking and fancy indexing can be applied below.
        cnp.ndarray[int64_t, ndim=1, mode='c'] remaining
        cnp.ndarray[float64_t, ndim=1, mode='c'] best_reach, carried, offered
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
        cnp.ndarray[uint8_t, mode='c'] keep_mask
        int64_t n_samples = PyArray_SHAPE(<cnp.PyArrayObject*> mutual_reachability)[0]
        int64_t current_node, best_pos, picked, step

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
    best_reach = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    remaining = np.arange(n_samples, dtype=np.int64)
    current_node = 0

    for step in range(n_samples - 1):
        # Drop the node that has just joined the tree from the candidates.
        keep_mask = remaining != current_node
        remaining = remaining[keep_mask]

        # Best reachability known so far for each remaining candidate versus
        # what the newly added node offers.
        carried = best_reach[keep_mask]
        offered = mutual_reachability[current_node][remaining]
        best_reach = np.minimum(carried, offered)

        best_pos = np.argmin(best_reach)
        picked = remaining[best_pos]

        mst[step].current_node = current_node
        mst[step].next_node = picked
        mst[step].distance = best_reach[best_pos]
        current_node = picked

    return mst
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
    const float64_t[:, ::1] raw_data,
    const float64_t[::1] core_distances,
    DistanceMetric64 dist_metric,
    float64_t alpha=1.0
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph generated from the provided `raw_data` and
    `core_distances` using Prim's algorithm.

    Parameters
    ----------
    raw_data : ndarray of shape (n_samples, n_features)
        Input array of data samples.

    core_distances : ndarray of shape (n_samples,)
        An array containing the core-distance calculated for each corresponding
        sample.

    dist_metric : DistanceMetric
        The distance metric to use when calculating pairwise distances for
        determining mutual-reachability.

    alpha : float, default=1.0
        Every pairwise distance is divided by this value before it enters the
        mutual-reachability computation.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """

    cdef:
        uint8_t[::1] in_tree
        float64_t[::1] min_reachability
        int64_t[::1] current_sources
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        int64_t current_node, source_node, new_node, candidate_source
        int64_t i, j, n_samples, num_features

        float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
        float64_t candidate_reach, pair_distance, candidate_core_dist

    n_samples = raw_data.shape[0]
    num_features = raw_data.shape[1]

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)

    in_tree = np.zeros(n_samples, dtype=np.uint8)
    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    current_sources = np.ones(n_samples, dtype=np.int64)

    current_node = 0

    for i in range(n_samples - 1):

        in_tree[current_node] = 1

        current_node_core_dist = core_distances[current_node]

        # Cheapest edge leaving the tree found so far in this pass.
        new_reachability = DBL_MAX
        source_node = 0
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            candidate_reach = min_reachability[j]
            candidate_source = current_sources[j]

            pair_distance = dist_metric.dist(
                &raw_data[current_node, 0],
                &raw_data[j, 0],
                num_features
            )

            pair_distance /= alpha

            candidate_core_dist = core_distances[j]
            mutual_reachability_distance = max(
                current_node_core_dist,
                candidate_core_dist,
                pair_distance
            )

            # Relax: the node just added reaches j strictly more cheaply than
            # any earlier tree node, so it becomes j's recorded source.
            if mutual_reachability_distance < candidate_reach:
                min_reachability[j] = mutual_reachability_distance
                current_sources[j] = current_node
                candidate_reach = mutual_reachability_distance
                candidate_source = current_node

            # Select: keep j if its best known edge beats the pass minimum.
            if candidate_reach < new_reachability:
                new_reachability = candidate_reach
                source_node = candidate_source
                new_node = j

        mst[i].current_node = source_node
        mst[i].next_node = new_node
        mst[i].distance = new_reachability
        current_node = new_node

    return mst
cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
    """Construct a single-linkage tree from an MST.

    Parameters
    ----------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.

    Returns
    -------
    single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
        The single-linkage tree (dendrogram) built from the MST. Each row
        of the array represents the following:

        - left node/cluster
        - right node/cluster
        - distance
        - new cluster size
    """
    cdef:
        cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage

        # The MST holds one edge fewer than there are samples.
        int64_t n_samples = mst.shape[0] + 1
        intp_t left_cluster, right_cluster
        int64_t edge_start, edge_end, row
        float64_t edge_distance
        UnionFind U = UnionFind(n_samples)

    single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype)

    for row in range(n_samples - 1):

        edge_start = mst[row].current_node
        edge_end = mst[row].next_node
        edge_distance = mst[row].distance

        # Clusters currently containing the two end points of the edge.
        left_cluster = U.fast_find(edge_start)
        right_cluster = U.fast_find(edge_end)

        single_linkage[row].left_node = left_cluster
        single_linkage[row].right_node = right_cluster
        single_linkage[row].value = edge_distance
        # Sizes are read before the union below merges the two clusters.
        single_linkage[row].cluster_size = U.size[left_cluster] + U.size[right_cluster]

        U.union(left_cluster, right_cluster)

    return single_linkage
@@ -0,0 +1,212 @@
# mutual reachability distance computations
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
# Meekail Zain <zainmeekail@gmail.com>
# Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Copyright (c) 2015, Leland McInnes
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
cimport numpy as cnp
import numpy as np
from scipy.sparse import issparse
from cython cimport floating, integral
from libc.math cimport isfinite, INFINITY
from ...utils._typedefs cimport intp_t
cnp.import_array()
def mutual_reachability_graph(
    distance_matrix, min_samples=5, max_distance=0.0
):
    """Compute the weighted adjacency matrix of the mutual reachability graph.

    The mutual reachability distance used to build the graph is defined as::

        max(d_core(x_p), d_core(x_q), d(x_p, x_q))

    and the core distance `d_core` is defined as the distance between a point
    `x_p` and its k-th nearest neighbor.

    Note that all computations are done in-place.

    Parameters
    ----------
    distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
        Array of distances between samples. If sparse, the array must be in
        `CSR` format.

    min_samples : int, default=5
        The number of points in a neighbourhood for a point to be considered
        a core point.

    max_distance : float, default=0.0
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_distance`. Only used when `distance_matrix` is a
        sparse matrix.

    Returns
    -------
    mutual_reachability_graph : {ndarray, sparse matrix} of shape \
            (n_samples, n_samples)
        Weighted adjacency matrix of the mutual reachability graph. This is
        the `distance_matrix` object passed in, modified in-place.

    References
    ----------
    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.
    """
    # Zero-based position of the neighbor whose distance is the core distance.
    further_neighbor_idx = min_samples - 1

    if not issparse(distance_matrix):
        _dense_mutual_reachability_graph(
            distance_matrix, further_neighbor_idx=further_neighbor_idx
        )
        return distance_matrix

    if distance_matrix.format != "csr":
        raise ValueError(
            "Only sparse CSR matrices are supported for `distance_matrix`."
        )
    _sparse_mutual_reachability_graph(
        distance_matrix.data,
        distance_matrix.indices,
        distance_matrix.indptr,
        distance_matrix.shape[0],
        further_neighbor_idx=further_neighbor_idx,
        max_distance=max_distance,
    )
    return distance_matrix
def _dense_mutual_reachability_graph(
    floating[:, :] distance_matrix,
    intp_t further_neighbor_idx,
):
    """Dense implementation of mutual reachability graph.

    The computation is done in-place, i.e. the distance matrix is modified
    directly.

    Parameters
    ----------
    distance_matrix : ndarray of shape (n_samples, n_samples)
        Array of distances between samples. Every entry is overwritten with
        the corresponding mutual reachability distance.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.
    """
    cdef:
        intp_t row, col, n_samples = distance_matrix.shape[0]
        floating reach
        floating[::1] core_distances

    # The distance matrix is assumed to be symmetric. Every row is partitioned
    # so that the implementation matches the sparse case, which requires a
    # CSR matrix.
    partitioned = np.partition(distance_matrix, further_neighbor_idx, axis=1)
    core_distances = np.ascontiguousarray(partitioned[:, further_neighbor_idx])

    with nogil:
        # TODO: Update w/ prange with thread count based on
        # _openmp_effective_n_threads
        for row in range(n_samples):
            for col in range(n_samples):
                reach = max(
                    core_distances[row],
                    core_distances[col],
                    distance_matrix[row, col],
                )
                distance_matrix[row, col] = reach
def _sparse_mutual_reachability_graph(
    cnp.ndarray[floating, ndim=1, mode="c"] data,
    cnp.ndarray[integral, ndim=1, mode="c"] indices,
    cnp.ndarray[integral, ndim=1, mode="c"] indptr,
    intp_t n_samples,
    intp_t further_neighbor_idx,
    floating max_distance,
):
    """Sparse implementation of mutual reachability graph.

    The computation is done in-place, i.e. the `data` array of the distance
    matrix is modified directly. The three arrays are expected to describe a
    `CSR` format sparse matrix.

    Parameters
    ----------
    data : ndarray of shape (n_nonzero,)
        Stored distances of the sparse matrix. Overwritten with the mutual
        reachability distances.

    indices : ndarray of shape (n_nonzero,)
        Column index of each stored distance.

    indptr : ndarray of shape (n_samples + 1,)
        Row pointers: the entries of row `i` are
        `data[indptr[i]:indptr[i + 1]]`.

    n_samples : int
        Number of rows of the matrix.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.
        Rows storing too few entries get an infinite core distance.

    max_distance : float
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_distance` if `max_distance` is strictly positive;
        otherwise the stored value is left untouched.
    """
    cdef:
        integral i, col_ind, row_ind
        floating reach
        floating[:] core_distances
        floating[:] row_values

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    core_distances = np.empty(n_samples, dtype=dtype)

    for i in range(n_samples):
        row_values = data[indptr[i]:indptr[i + 1]]
        if further_neighbor_idx >= row_values.size:
            # Not enough stored neighbors to define a finite core distance.
            core_distances[i] = INFINITY
        else:
            core_distances[i] = np.partition(
                row_values, further_neighbor_idx
            )[further_neighbor_idx]

    with nogil:
        for row_ind in range(n_samples):
            for i in range(indptr[row_ind], indptr[row_ind + 1]):
                col_ind = indices[i]
                reach = max(
                    core_distances[row_ind], core_distances[col_ind], data[i]
                )
                if isfinite(reach):
                    data[i] = reach
                elif max_distance > 0:
                    data[i] = max_distance
@@ -0,0 +1,49 @@
# Copyright (c) 2015, Leland McInnes
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ...utils._typedefs cimport intp_t, float64_t, uint8_t
cimport numpy as cnp
# This corresponds to the scipy.cluster.hierarchy format: one record per
# merge of two nodes/clusters.
ctypedef packed struct HIERARCHY_t:
    # First of the two nodes/clusters being merged.
    intp_t left_node
    # Second of the two nodes/clusters being merged.
    intp_t right_node
    # Value attached to the merge.
    # NOTE(review): presumably the distance at which the merge happens - confirm.
    float64_t value
    # Size of the cluster resulting from the merge.
    intp_t cluster_size
# Effectively an edgelist encoding a parent/child pair, along with a value and
# the corresponding cluster_size in each row providing a tree structure.
ctypedef packed struct CONDENSED_t:
    # Parent end of the edge.
    intp_t parent
    # Child end of the edge.
    intp_t child
    # Value attached to the edge.
    float64_t value
    # Cluster size associated with this row.
    intp_t cluster_size
cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
@@ -0,0 +1,799 @@
# Tree handling (condensing, finding stable clusters) for hdbscan
# Authors: Leland McInnes
# Copyright (c) 2015, Leland McInnes
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
cimport numpy as cnp
from libc.math cimport isinf
import cython
import numpy as np
cnp.import_array()
# NumPy C-API accessor returning the shape array of an ndarray; used in this
# module to read the number of rows of typed arrays.
cdef extern from "numpy/arrayobject.h":
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
# Lambda value used when a merge distance is not strictly positive.
cdef cnp.float64_t INFTY = np.inf
# Label given to samples that are not assigned to any cluster.
cdef cnp.intp_t NOISE = -1
# Structured dtype of one row of a single linkage hierarchy: the two merged
# nodes, the value of the merge and the size of the resulting cluster.
HIERARCHY_dtype = np.dtype([
("left_node", np.intp),
("right_node", np.intp),
("value", np.float64),
("cluster_size", np.intp),
])
# Structured dtype of one row (edge) of a condensed tree: parent and child
# identifiers, the value of the edge and the associated cluster size.
CONDENSED_dtype = np.dtype([
("parent", np.intp),
("child", np.intp),
("value", np.float64),
("cluster_size", np.intp),
])
cpdef tuple tree_to_labels(
    const HIERARCHY_t[::1] single_linkage_tree,
    cnp.intp_t min_cluster_size=10,
    cluster_selection_method="eom",
    bint allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None,
):
    """Turn a single linkage tree into flat labels and probabilities.

    The tree is condensed with `_condense_tree`, the stability of the
    condensed clusters is computed with `_compute_stability`, and both are
    handed to `_get_clusters` together with the selection options.

    Parameters
    ----------
    single_linkage_tree : memoryview of HIERARCHY_t
        The single linkage hierarchy to extract clusters from.
    min_cluster_size : int, default=10
        Forwarded to `_condense_tree`.
    cluster_selection_method : str, default="eom"
        Forwarded to `_get_clusters`.
    allow_single_cluster : bool, default=False
        Forwarded to `_get_clusters`.
    cluster_selection_epsilon : float, default=0.0
        Forwarded to `_get_clusters`.
    max_cluster_size : int or None, default=None
        Forwarded to `_get_clusters`.

    Returns
    -------
    (labels, probabilities) : tuple of ndarray
        The two arrays produced by `_get_clusters`.
    """
    cdef:
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] point_labels
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] strengths
        dict cluster_stability

    condensed = _condense_tree(single_linkage_tree, min_cluster_size)
    cluster_stability = _compute_stability(condensed)
    point_labels, strengths = _get_clusters(
        condensed,
        cluster_stability,
        cluster_selection_method,
        allow_single_cluster,
        cluster_selection_epsilon,
        max_cluster_size,
    )
    return (point_labels, strengths)
cdef list bfs_from_hierarchy(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t bfs_root
):
    """
    Return the nodes reachable from `bfs_root`, level by level.

    The tree is in scipy hclust format: ids below `n_samples` are leaves,
    and node `i >= n_samples` is the union of the two nodes stored in
    `hierarchy[i - n_samples]`.
    """
    cdef list frontier, next_frontier, visited
    cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
    cdef cnp.intp_t node, merge_idx

    frontier = [bfs_root]
    visited = []
    while frontier:
        visited.extend(frontier)
        next_frontier = []
        for node in frontier:
            # Leaves have no children; only merge nodes are expanded.
            if node >= n_samples:
                merge_idx = node - n_samples
                next_frontier.append(hierarchy[merge_idx].left_node)
                next_frontier.append(hierarchy[merge_idx].right_node)
        frontier = next_frontier
    return visited
cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
const HIERARCHY_t[::1] hierarchy,
cnp.intp_t min_cluster_size=10
):
"""Condense a tree according to a minimum cluster size. This is akin
to the runt pruning procedure of Stuetzle. The result is a much simpler
tree that is easier to visualize. We include extra information on the
lambda value at which individual points depart clusters for later
analysis and computation.
Parameters
----------
hierarchy : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
A single linkage hierarchy in scipy.cluster.hierarchy format.
min_cluster_size : int, optional (default 10)
The minimum size of clusters to consider. Clusters smaller than this
are pruned from the tree.
Returns
-------
condensed_tree : ndarray of shape (n_edges,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure. One row is emitted per edge appended while walking the
hierarchy.
"""
cdef:
# The root of the hierarchy is the node with the largest id.
cnp.intp_t root = 2 * hierarchy.shape[0]
cnp.intp_t n_samples = hierarchy.shape[0] + 1
# The root is relabelled `n_samples` below, so new labels start after it.
cnp.intp_t next_label = n_samples + 1
list result_list, node_list = bfs_from_hierarchy(hierarchy, root)
cnp.intp_t[::1] relabel
cnp.uint8_t[::1] ignore
cnp.intp_t node, sub_node, left, right
cnp.float64_t lambda_value, distance
cnp.intp_t left_count, right_count
HIERARCHY_t children
# relabel[i] is the condensed-tree cluster id given to hierarchy node i.
relabel = np.empty(root + 1, dtype=np.intp)
relabel[root] = n_samples
result_list = []
# ignore[i] flags nodes already handled as part of a pruned subtree.
ignore = np.zeros(len(node_list), dtype=bool)
for node in node_list:
# Leaves and already-pruned nodes have nothing to split.
if ignore[node] or node < n_samples:
continue
children = hierarchy[node - n_samples]
left = children.left_node
right = children.right_node
distance = children.value
# lambda is the inverse of the merge distance; a non-positive distance
# maps to infinity.
if distance > 0.0:
lambda_value = 1.0 / distance
else:
lambda_value = INFTY
# Size of each child: read from the hierarchy for merge nodes, 1 for
# leaves.
if left >= n_samples:
left_count = hierarchy[left - n_samples].cluster_size
else:
left_count = 1
if right >= n_samples:
right_count = <cnp.intp_t> hierarchy[right - n_samples].cluster_size
else:
right_count = 1
# Both children are large enough: each one gets a fresh cluster label.
if left_count >= min_cluster_size and right_count >= min_cluster_size:
relabel[left] = next_label
next_label += 1
result_list.append(
(relabel[node], relabel[left], lambda_value, left_count)
)
relabel[right] = next_label
next_label += 1
result_list.append(
(relabel[node], relabel[right], lambda_value, right_count)
)
# Both children are too small: the samples below them are attached
# directly to the current cluster with a size of 1.
elif left_count < min_cluster_size and right_count < min_cluster_size:
for sub_node in bfs_from_hierarchy(hierarchy, left):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
for sub_node in bfs_from_hierarchy(hierarchy, right):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
# Only the left child is too small: the right child inherits the label
# of the current cluster and the left samples are attached to it.
elif left_count < min_cluster_size:
relabel[right] = relabel[node]
for sub_node in bfs_from_hierarchy(hierarchy, left):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
# Only the right child is too small: mirror of the previous case.
else:
relabel[left] = relabel[node]
for sub_node in bfs_from_hierarchy(hierarchy, right):
if sub_node < n_samples:
result_list.append(
(relabel[node], sub_node, lambda_value, 1)
)
ignore[sub_node] = True
return np.array(result_list, dtype=CONDENSED_dtype)
# Compute a stability score for every parent cluster of the condensed tree.
# Each edge adds (edge lambda - birth lambda of its parent) * cluster_size to
# the score of its parent. Returns a dict mapping cluster id -> score for every
# id between the smallest and the largest parent id.
cdef dict _compute_stability(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
):
cdef:
cnp.float64_t[::1] result, births
cnp.intp_t[:] parents = condensed_tree['parent']
cnp.intp_t parent, cluster_size, result_index, idx
cnp.float64_t lambda_val
CONDENSED_t condensed_node
cnp.intp_t largest_child = condensed_tree['child'].max()
cnp.intp_t smallest_cluster = np.min(parents)
cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
dict stability_dict = {}
# Make sure `births` can also be indexed by the smallest cluster id.
largest_child = max(largest_child, smallest_cluster)
# births[c] is the value of the edge whose child is c (NaN when no such edge).
births = np.full(largest_child + 1, np.nan, dtype=np.float64)
for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
condensed_node = condensed_tree[idx]
births[condensed_node.child] = condensed_node.value
# The smallest parent id (presumably the root) is given a birth value of 0.
births[smallest_cluster] = 0.0
# result is indexed by `cluster id - smallest_cluster`.
result = np.zeros(num_clusters, dtype=np.float64)
for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
condensed_node = condensed_tree[idx]
parent = condensed_node.parent
lambda_val = condensed_node.value
cluster_size = condensed_node.cluster_size
result_index = parent - smallest_cluster
result[result_index] += (lambda_val - births[parent]) * cluster_size
# Translate the offset-based array back to a dict keyed by cluster id.
for idx in range(num_clusters):
stability_dict[idx + smallest_cluster] = result[idx]
return stability_dict
cdef list bfs_from_cluster_tree(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    cnp.intp_t bfs_root
):
    """Return `bfs_root` and all of its descendants in breadth-first order.

    At every step the next level is made of the children of all edges whose
    parent belongs to the current level.
    """
    cdef:
        cnp.ndarray[cnp.intp_t, ndim=1] child_ids = condensed_tree['child']
        cnp.intp_t[:] parent_ids = condensed_tree['parent']
        cnp.ndarray[cnp.intp_t, ndim=1] frontier = (
            np.array([bfs_root], dtype=np.intp)
        )
        list visited = []

    while len(frontier) > 0:
        visited.extend(frontier.tolist())
        # Select the children of every edge leaving the current level.
        frontier = child_ids[np.isin(parent_ids, frontier)]
    return visited
# For each parent cluster, record the largest edge value (lambda) among its
# rows. The result is indexed by parent id; ids that never appear as a parent
# keep the value 0.
# NOTE(review): the scan only flushes a maximum when the parent id changes, so
# it assumes rows sharing a parent are contiguous — confirm against the
# producer of `condensed_tree`. It also reads row 0 unconditionally, so an
# empty tree is presumably never passed in.
cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):
cdef:
cnp.intp_t parent, current_parent, idx
cnp.float64_t lambda_val, max_lambda
cnp.float64_t[::1] deaths
cnp.intp_t largest_parent = condensed_tree['parent'].max()
deaths = np.zeros(largest_parent + 1, dtype=np.float64)
# Seed the running maximum with the first row.
current_parent = condensed_tree[0].parent
max_lambda = condensed_tree[0].value
for idx in range(1, PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
parent = condensed_tree[idx].parent
lambda_val = condensed_tree[idx].value
if parent == current_parent:
max_lambda = max(max_lambda, lambda_val)
else:
# A new parent starts: store the finished maximum and restart.
deaths[current_parent] = max_lambda
current_parent = parent
max_lambda = lambda_val
deaths[current_parent] = max_lambda # value for last parent
return deaths
# Union-find (disjoint set) structure over the integers [0, size).
@cython.final
cdef class TreeUnionFind:
# data[i, 0] is the parent pointer of element i, data[i, 1] its rank.
cdef cnp.intp_t[:, ::1] data
# is_component[i] starts at 1 and is cleared once `find` sees that i is not
# its own parent.
cdef cnp.uint8_t[::1] is_component
def __init__(self, size):
cdef cnp.intp_t idx
self.data = np.zeros((size, 2), dtype=np.intp)
# Every element starts as the root of its own singleton set.
for idx in range(size):
self.data[idx, 0] = idx
self.is_component = np.ones(size, dtype=np.uint8)
# Merge the sets containing x and y, attaching the root of lower rank under
# the root of higher rank.
cdef void union(self, cnp.intp_t x, cnp.intp_t y):
cdef cnp.intp_t x_root = self.find(x)
cdef cnp.intp_t y_root = self.find(y)
if self.data[x_root, 1] < self.data[y_root, 1]:
self.data[x_root, 0] = y_root
elif self.data[x_root, 1] > self.data[y_root, 1]:
self.data[y_root, 0] = x_root
else:
# Equal ranks: x_root becomes the parent and its rank grows by one.
self.data[y_root, 0] = x_root
self.data[x_root, 1] += 1
return
# Return the root of the set containing x, re-pointing x to that root along
# the way.
cdef cnp.intp_t find(self, cnp.intp_t x):
if self.data[x, 0] != x:
self.data[x, 0] = self.find(self.data[x, 0])
self.is_component[x] = False
return self.data[x, 0]
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
const HIERARCHY_t[::1] linkage,
cnp.float64_t cut,
cnp.intp_t min_cluster_size
):
"""Given a single linkage tree and a cut value, return the
vector of cluster labels at that cut value. This is useful
for Robust Single Linkage, and extracting DBSCAN results
from a single HDBSCAN run.
Parameters
----------
linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
The single linkage tree in scipy.cluster.hierarchy format.
cut : double
The cut value at which to find clusters.
min_cluster_size : int
The minimum cluster size; clusters below this size at
the cut will be considered noise.
Returns
-------
labels : ndarray of shape (n_samples,)
The cluster labels for each point in the data set;
a label of -1 denotes a noise assignment.
"""
cdef:
cnp.intp_t n, cluster, root, n_samples, cluster_label
cnp.intp_t[::1] unique_labels, cluster_size
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
TreeUnionFind union_find
dict cluster_label_map
HIERARCHY_t node
# The tree has `linkage.shape[0] + 1` leaves and `root + 1` nodes in total.
root = 2 * linkage.shape[0]
n_samples = root // 2 + 1
result = np.empty(n_samples, dtype=np.intp)
union_find = TreeUnionFind(root + 1)
# Merge nodes are numbered from n_samples upwards, one per linkage row.
cluster = n_samples
for node in linkage:
# Only merges strictly below the cut join their children.
if node.value < cut:
union_find.union(node.left_node, cluster)
union_find.union(node.right_node, cluster)
cluster += 1
# Count how many samples end up under each union-find root.
cluster_size = np.zeros(cluster, dtype=np.intp)
for n in range(n_samples):
cluster = union_find.find(n)
cluster_size[cluster] += 1
result[n] = cluster
# Map union-find roots to consecutive labels, or to NOISE when the component
# is smaller than min_cluster_size.
cluster_label_map = {-1: NOISE}
cluster_label = 0
unique_labels = np.unique(result)
for cluster in unique_labels:
if cluster_size[cluster] < min_cluster_size:
cluster_label_map[cluster] = NOISE
else:
cluster_label_map[cluster] = cluster_label
cluster_label += 1
for n in range(n_samples):
result[n] = cluster_label_map[result[n]]
return result
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
set clusters,
dict cluster_label_map,
cnp.intp_t allow_single_cluster,
cnp.float64_t cluster_selection_epsilon
):
"""Given a condensed tree, clusters and a labeling map for the clusters,
return an array containing the labels of each point based on cluster
membership. Note that this is where points may be marked as noisy
outliers. In large, single-cluster datasets, the determination of some
points as noise is controlled by the `allow_single_cluster` and
`cluster_selection_epsilon` parameters.
Parameters
----------
condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure.
clusters : set
The set of nodes corresponding to identified clusters. These node
values should be the same as those present in `condensed_tree`.
cluster_label_map : dict
A mapping from the node values present in `clusters` to the labels
which will be returned.
allow_single_cluster : int
When truthy and exactly one cluster was selected, points whose
union-find root is the root cluster may receive that cluster's label
instead of noise, provided their lambda reaches the threshold below.
cluster_selection_epsilon : float
When non-zero, `1 / cluster_selection_epsilon` is the lambda threshold
used in the single-cluster case. When zero, the threshold is the largest
lambda among the edges leaving the root cluster.
Returns
-------
labels : ndarray of shape (n_samples,)
The cluster labels for each point in the data set;
a label of -1 denotes a noise assignment.
"""
cdef:
cnp.intp_t root_cluster
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
TreeUnionFind union_find
cnp.intp_t n, parent, child, cluster
cnp.float64_t threshold
child_array = condensed_tree['child']
parent_array = condensed_tree['parent']
lambda_array = condensed_tree['value']
# The smallest parent id is the root cluster; ids below it are the points,
# so the result has one entry per id in [0, root_cluster).
root_cluster = np.min(parent_array)
result = np.empty(root_cluster, dtype=np.intp)
union_find = TreeUnionFind(np.max(parent_array) + 1)
# Merge every child that is not itself a selected cluster into its parent,
# so that each point ends up in the set of its closest selected ancestor.
for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
child = child_array[n]
parent = parent_array[n]
if child not in clusters:
union_find.union(parent, child)
for n in range(root_cluster):
cluster = union_find.find(n)
label = NOISE
if cluster != root_cluster:
label = cluster_label_map[cluster]
elif len(clusters) == 1 and allow_single_cluster:
# There can only be one edge with this particular child hence this
# expression extracts a unique, scalar lambda value.
parent_lambda = lambda_array[child_array == n]
if cluster_selection_epsilon != 0.0:
threshold = 1 / cluster_selection_epsilon
else:
# The threshold should be calculated per-sample based on the
# largest lambda of any sibling node.
threshold = lambda_array[parent_array == cluster].max()
if parent_lambda >= threshold:
label = cluster_label_map[cluster]
result[n] = label
return result
# Compute a membership strength in [0, 1] for every labelled point.
# `cluster_map` maps a label (as stored in `labels`) to the corresponding
# cluster id of the condensed tree. Points labelled -1 keep a strength of 0.
cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
dict cluster_map,
cnp.intp_t[::1] labels
):
cdef:
cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result
cnp.float64_t[:] lambda_array
cnp.float64_t[::1] deaths
cnp.intp_t[:] child_array, parent_array
cnp.intp_t root_cluster, n, point, cluster_num, cluster
cnp.float64_t max_lambda, lambda_val
child_array = condensed_tree['child']
parent_array = condensed_tree['parent']
lambda_array = condensed_tree['value']
result = np.zeros(labels.shape[0])
# deaths[c] is the largest lambda among the edges leaving cluster c.
deaths = max_lambdas(condensed_tree)
root_cluster = np.min(parent_array)
for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
point = child_array[n]
# Children with an id at or above the root id are clusters, not points.
if point >= root_cluster:
continue
cluster_num = labels[point]
# Noise points keep the default strength of 0.
if cluster_num == -1:
continue
cluster = cluster_map[cluster_num]
max_lambda = deaths[cluster]
# A zero maximum or an infinite point lambda gives full membership and
# avoids an ill-defined ratio.
if max_lambda == 0.0 or isinf(lambda_array[n]):
result[point] = 1.0
else:
# Ratio of the point's lambda (capped at the cluster maximum) to the
# cluster maximum.
lambda_val = min(lambda_array[n], max_lambda)
result[point] = lambda_val / max_lambda
return result
cpdef list recurse_leaf_dfs(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.intp_t current_node
):
    """Return the leaves of the subtree of `cluster_tree` rooted at
    `current_node`, in depth-first order.

    Parameters
    ----------
    cluster_tree : ndarray of dtype CONDENSED_dtype
        Edge list of the cluster tree.
    current_node : int
        Node whose descendant leaves are collected.

    Returns
    -------
    leaves : list of int
        `[current_node]` when no edge has `current_node` as parent, otherwise
        the concatenation of the leaves found below each child, children
        being visited in the order in which they appear in `cluster_tree`.
    """
    cdef cnp.intp_t[:] children
    cdef cnp.intp_t child
    cdef list leaves

    children = cluster_tree[cluster_tree['parent'] == current_node]['child']
    if children.shape[0] == 0:
        return [current_node]

    # Grow a single list in place: `sum(list_of_lists, [])` copies the
    # accumulated result for every child, which is quadratic in the output.
    leaves = []
    for child in children:
        leaves.extend(recurse_leaf_dfs(cluster_tree, child))
    return leaves
cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree):
    """Return every leaf cluster of `cluster_tree`.

    An empty tree has no leaves. Otherwise the search starts from the
    smallest parent id found in the tree.
    """
    cdef cnp.intp_t top_cluster

    if PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] != 0:
        top_cluster = cluster_tree['parent'].min()
        return recurse_leaf_dfs(cluster_tree, top_cluster)
    return []
cdef cnp.intp_t traverse_upwards(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t leaf,
    cnp.intp_t allow_single_cluster
):
    """Climb from `leaf` towards the root until an ancestor is born at a
    distance larger than `cluster_selection_epsilon`.

    Parameters
    ----------
    cluster_tree : ndarray of dtype CONDENSED_dtype
        Edge list of the cluster tree; each node is expected to appear
        exactly once as a child.
    cluster_selection_epsilon : float
        Distance threshold an ancestor must exceed to be returned.
    leaf : int
        Node the climb starts from.
    allow_single_cluster : int
        When truthy, the root itself may be returned.

    Returns
    -------
    node : int
        The first ancestor whose `1 / value` exceeds the threshold. If the
        root is reached first, the root when `allow_single_cluster` is
        truthy, otherwise the node just below it.
    """
    cdef cnp.intp_t root, parent, node
    cdef cnp.float64_t parent_eps

    # The root does not change while climbing: compute it once.
    root = cluster_tree['parent'].min()
    node = leaf
    while True:
        # Take the single matching row explicitly: implicit conversion of a
        # size-1 array to a C scalar is deprecated by NumPy.
        parent = cluster_tree[cluster_tree['child'] == node]['parent'][0]
        if parent == root:
            if allow_single_cluster:
                return parent
            return node  # return node closest to root
        # The division is done on the NumPy scalar, as in the array form.
        parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'][0]
        if parent_eps > cluster_selection_epsilon:
            return parent
        node = parent
cdef set epsilon_search(
    set leaves,
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t allow_single_cluster
):
    """Replace leaves born below the epsilon threshold by a suitable ancestor.

    Parameters
    ----------
    leaves : set of int
        Candidate clusters.
    cluster_tree : ndarray of dtype CONDENSED_dtype
        Edge list of the cluster tree.
    cluster_selection_epsilon : float
        Distance threshold; a leaf whose `1 / value` is below it is replaced
        by the node returned by `traverse_upwards`.
    allow_single_cluster : int
        Forwarded to `traverse_upwards`.

    Returns
    -------
    selected_clusters : set of int
        Leaves at or above the threshold, plus the ancestors selected for the
        others.
    """
    cdef:
        set selected_clusters = set()
        # Descendants of an already selected ancestor. Only membership is
        # ever tested, so a set gives O(1) lookups.
        set processed = set()
        cnp.intp_t leaf, epsilon_child, sub_node
        cnp.float64_t eps
        cnp.uint8_t[:] leaf_nodes
        cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child']
        cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value']

    for leaf in leaves:
        leaf_nodes = children == leaf
        eps = 1 / distances[leaf_nodes][0]
        if eps < cluster_selection_epsilon:
            if leaf not in processed:
                epsilon_child = traverse_upwards(
                    cluster_tree,
                    cluster_selection_epsilon,
                    leaf,
                    allow_single_cluster
                )
                selected_clusters.add(epsilon_child)

                # Everything below the selected ancestor is now covered.
                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
                    if sub_node != epsilon_child:
                        processed.add(sub_node)
        else:
            selected_clusters.add(leaf)

    return selected_clusters
@cython.wraparound(True)
cdef tuple _get_clusters(
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
dict stability,
cluster_selection_method='eom',
cnp.uint8_t allow_single_cluster=False,
cnp.float64_t cluster_selection_epsilon=0.0,
max_cluster_size=None
):
"""Given a tree and stability dict, produce the cluster labels
(and probabilities) for a flat clustering based on the chosen
cluster selection method.
Parameters
----------
condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
Effectively an edgelist encoding a parent/child pair, along with a
value and the corresponding cluster_size in each row providing a tree
structure.
stability : dict
A dictionary mapping cluster_ids to stability values. With the 'eom'
method its values may be overwritten in place.
cluster_selection_method : string, optional (default 'eom')
The method of selecting clusters. The default is the
Excess of Mass algorithm specified by 'eom'. The alternate
option is 'leaf'.
allow_single_cluster : boolean, optional (default False)
Whether to allow a single cluster to be selected by the
Excess of Mass algorithm.
cluster_selection_epsilon: double, optional (default 0.0)
A distance threshold for cluster splits.
max_cluster_size: int, default=None
The maximum size for clusters located by the EOM clusterer. Can
be overridden by the cluster_selection_epsilon parameter in
rare cases.
Returns
-------
labels : ndarray of shape (n_samples,)
An integer array of cluster labels, with -1 denoting noise.
probabilities : ndarray (n_samples,)
The cluster membership strength of each sample.
"""
cdef:
list node_list
cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree
cnp.uint8_t[::1] child_selection
cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
dict is_cluster, cluster_sizes
cnp.float64_t subtree_stability
cnp.intp_t node, sub_node, cluster, n_samples
cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs
# Assume clusters are ordered by numeric id equivalent to
# a topological sort of the tree; This is valid given the
# current implementation above, so don't change that ... or
# if you do, change this accordingly!
if allow_single_cluster:
node_list = sorted(stability.keys(), reverse=True)
else:
node_list = sorted(stability.keys(), reverse=True)[:-1]
# (exclude root)
# Keep only the edges between clusters (edges to single points have size 1).
cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1]
# Every candidate starts selected; the chosen method then prunes the set.
is_cluster = {cluster: True for cluster in node_list}
# Point ids are the children of size-1 edges; the largest one gives n_samples.
n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1
if max_cluster_size is None:
max_cluster_size = n_samples + 1 # Set to a value that will never be triggered
cluster_sizes = {
child: cluster_size for child, cluster_size
in zip(cluster_tree['child'], cluster_tree['cluster_size'])
}
if allow_single_cluster:
# Compute cluster size for the root node
cluster_sizes[node_list[-1]] = np.sum(
cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size'])
if cluster_selection_method == 'eom':
# Nodes are visited in decreasing id order, i.e. children before parents.
for node in node_list:
child_selection = (cluster_tree['parent'] == node)
subtree_stability = np.sum([
stability[child] for
child in cluster_tree['child'][child_selection]])
# Prefer the children when they are jointly more stable than the node, or
# when the node is larger than allowed; the node then carries the
# stability of its subtree upwards.
if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
is_cluster[node] = False
stability[node] = subtree_stability
else:
# The node is kept: all of its descendants are deselected.
for sub_node in bfs_from_cluster_tree(cluster_tree, node):
if sub_node != node:
is_cluster[sub_node] = False
if cluster_selection_epsilon != 0.0 and PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
eom_clusters = [c for c in is_cluster if is_cluster[c]]
selected_clusters = []
# first check if eom_clusters only has root node, which skips epsilon check.
if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()):
if allow_single_cluster:
selected_clusters = eom_clusters
else:
selected_clusters = epsilon_search(
set(eom_clusters),
cluster_tree,
cluster_selection_epsilon,
allow_single_cluster
)
# Rewrite every flag from the epsilon-adjusted selection.
for c in is_cluster:
if c in selected_clusters:
is_cluster[c] = True
else:
is_cluster[c] = False
elif cluster_selection_method == 'leaf':
leaves = set(get_cluster_tree_leaves(cluster_tree))
# Without any leaf cluster, fall back to the root of the condensed tree.
if len(leaves) == 0:
for c in is_cluster:
is_cluster[c] = False
is_cluster[condensed_tree['parent'].min()] = True
if cluster_selection_epsilon != 0.0:
selected_clusters = epsilon_search(
leaves,
cluster_tree,
cluster_selection_epsilon,
allow_single_cluster
)
else:
selected_clusters = leaves
for c in is_cluster:
if c in selected_clusters:
is_cluster[c] = True
else:
is_cluster[c] = False
clusters = set([c for c in is_cluster if is_cluster[c]])
# Selected cluster ids are mapped to consecutive labels in increasing id
# order; the reverse map goes from label back to cluster id.
cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
reverse_cluster_map = {n: c for c, n in cluster_map.items()}
labels = _do_labelling(
condensed_tree,
clusters,
cluster_map,
allow_single_cluster,
cluster_selection_epsilon
)
probs = get_probabilities(condensed_tree, reverse_cluster_map, labels)
return (labels, probs)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,16 @@
# Extension modules of the HDBSCAN subpackage, keyed by module name; each entry
# lists the sources the module is built from.
cluster_hdbscan_extension_metadata = {
'_linkage': {'sources': ['_linkage.pyx', metrics_cython_tree]},
'_reachability': {'sources': ['_reachability.pyx']},
'_tree': {'sources': ['_tree.pyx']}
}
# Declare one installed extension module per entry above, all sharing the same
# NumPy dependency, Cython arguments and install sub-directory.
foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata
py.extension_module(
ext_name,
ext_dict.get('sources'),
dependencies: [np_dep],
cython_args: cython_args,
subdir: 'sklearn/cluster/_hdbscan',
install: true
)
endforeach
@@ -0,0 +1,63 @@
import numpy as np
import pytest
from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
from sklearn.utils._testing import (
_convert_container,
assert_allclose,
)
def test_mutual_reachability_graph_error_sparse_format():
    """A sparse input that is not in CSR format must be rejected."""
    random_state = np.random.RandomState(0)
    samples = random_state.randn(10, 10)
    gram = samples.T @ samples
    np.fill_diagonal(gram, 0.0)
    csc_gram = _convert_container(gram, "sparse_csc")

    with pytest.raises(
        ValueError, match="Only sparse CSR matrices are supported"
    ):
        mutual_reachability_graph(csc_gram)
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
def test_mutual_reachability_graph_inplace(array_type):
    """The returned graph must be the very object that was passed in."""
    random_state = np.random.RandomState(0)
    samples = random_state.randn(10, 10)
    distances = samples.T @ samples
    np.fill_diagonal(distances, 0.0)
    distances = _convert_container(distances, array_type)

    returned = mutual_reachability_graph(distances)

    assert returned is distances
def test_mutual_reachability_graph_equivalence_dense_sparse():
    """Dense and sparse CSR inputs must lead to the same graph."""
    random_state = np.random.RandomState(0)
    samples = random_state.randn(5, 5)
    dense_input = samples.T @ samples
    # Build the sparse copy before the dense input is handed to the function.
    sparse_input = _convert_container(dense_input, "sparse_csr")

    from_dense = mutual_reachability_graph(dense_input, min_samples=3)
    from_sparse = mutual_reachability_graph(sparse_input, min_samples=3)

    assert_allclose(from_dense, from_sparse.toarray())
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
    """The output graph must keep the floating point dtype of the input."""
    random_state = np.random.RandomState(0)
    samples = random_state.randn(10, 10)
    distances = (samples.T @ samples).astype(dtype)
    np.fill_diagonal(distances, 0.0)
    distances = _convert_container(distances, array_type)
    # Sanity check: the conversion itself must not change the dtype.
    assert distances.dtype == dtype

    graph = mutual_reachability_graph(distances)

    assert graph.dtype == dtype
@@ -0,0 +1,9 @@
from ..utils._typedefs cimport intp_t
# Declaration-only view of the UnionFind extension type: its C-level attributes
# and the signatures of its C-level methods.
cdef class UnionFind:
# Label that will be given to the next merged cluster.
cdef intp_t next_label
# Per-node parent label.
cdef intp_t[:] parent
# Per-node cluster size.
cdef intp_t[:] size
# Merge nodes m and n under a new label.
cdef void union(self, intp_t m, intp_t n) noexcept
# Return the top-most label currently reachable from node n.
cdef intp_t fast_find(self, intp_t n) noexcept
@@ -0,0 +1,506 @@
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
import numpy as np
cimport cython
from ..metrics._dist_metrics cimport DistanceMetric64
from ..utils._fast_dict cimport IntFloatDict
from ..utils._typedefs cimport float64_t, intp_t, uint8_t
# C++
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.map cimport map as cpp_map
from libc.math cimport fmax, INFINITY
###############################################################################
# Utilities for computing the ward momentum
def compute_ward_dist(
    const float64_t[::1] m_1,
    const float64_t[:, ::1] m_2,
    const intp_t[::1] coord_row,
    const intp_t[::1] coord_col,
    float64_t[::1] res
):
    """Fill `res` with one weighted squared distance per (row, col) pair.

    For the pair at position `k` of `coord_row` / `coord_col`, the value
    written to `res[k]` is::

        sum_j (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2
        * (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])

    Parameters
    ----------
    m_1 : memoryview of float64, 1-D
        Per-node scalar used as divisor and in the weight.
    m_2 : memoryview of float64, 2-D
        Per-node vector, one row per node.
    coord_row, coord_col : memoryview of intp, 1-D
        Row and column node indices of the pairs to evaluate.
    res : memoryview of float64, 1-D
        Output buffer, modified in place.
    """
    cdef intp_t n_pairs = coord_row.shape[0]
    cdef intp_t n_cols = m_2.shape[1]
    cdef intp_t pair, feat, r, c
    cdef float64_t sq_dist, weight

    for pair in range(n_pairs):
        r = coord_row[pair]
        c = coord_col[pair]
        weight = (m_1[r] * m_1[c]) / (m_1[r] + m_1[c])
        sq_dist = 0.
        for feat in range(n_cols):
            sq_dist += (m_2[r, feat] / m_1[r] - m_2[c, feat] / m_1[c]) ** 2
        res[pair] = sq_dist * weight
###############################################################################
# Utilities for cutting and exploring a hierarchical tree
def _hc_get_descendent(intp_t node, children, intp_t n_leaves):
    """
    Collect all the leaves found below a node of the tree.

    Parameters
    ----------
    node : integer
        The node for which we want the descendents.

    children : list of pairs, length n_nodes
        The children of each non-leaf node. Values less than `n_leaves` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_leaves]`.

    n_leaves : integer
        Number of leaves.

    Returns
    -------
    descendent : list of int
        `[node]` when `node` is itself a leaf, otherwise the leaves below it.
    """
    cdef intp_t current, n_pending = 1

    pending = [node]
    if node < n_leaves:
        return pending

    leaves = []
    # `n_pending` mirrors the length of `pending`: popping a leaf removes one
    # entry, popping an inner node removes one and adds its pair of children.
    while n_pending:
        current = pending.pop()
        if current >= n_leaves:
            pending.extend(children[current - n_leaves])
            n_pending += 1
        else:
            leaves.append(current)
            n_pending -= 1
    return leaves
def hc_get_heads(intp_t[:] parents, copy=True):
    """Return, for each node, the head of the tree it belongs to.

    Parameters
    ----------
    parents : array of integers
        The parent structure defining the forest (ensemble of trees). A node
        that is its own parent is a head.
    copy : boolean
        If copy is False, the input 'parents' array is modified inplace

    Returns
    -------
    heads : array of integers of same shape as parents
        The indices in the 'parents' of the tree heads
    """
    cdef intp_t n_nodes, start, current, above

    if copy:
        parents = np.copy(parents)
    n_nodes = parents.size
    # Start from the top of the tree and go down
    for start in range(n_nodes - 1, -1, -1):
        current = start
        while True:
            above = parents[current]
            if above == current:
                break
            # Record the best known ancestor of `start` and keep climbing.
            parents[start] = above
            current = above
    return parents
def _get_parents(
    nodes,
    heads,
    const intp_t[:] parents,
    uint8_t[::1] not_visited
):
    """Append to 'heads' the not-yet-visited heads of the given nodes.

    Modifies 'heads' and 'not_visited' in-place.

    Parameters
    ----------
    nodes : list of integers
        The nodes to start from
    heads : list of integers
        A list to hold the results (modified inplace)
    parents : array of integers
        The parent structure defining the tree; a node that is its own
        parent is a head
    not_visited
        The tree nodes to consider (modified inplace): a head is appended
        only while its flag is set, and the flag is then cleared
    """
    cdef intp_t current, above

    for start in nodes:
        current = start
        # Climb until reaching a node that is its own parent.
        while True:
            above = parents[current]
            if above == current:
                break
            current = above
        if not_visited[current]:
            not_visited[current] = 0
            heads.append(current)
###############################################################################
# merge strategies implemented on IntFloatDicts
# These are used in the hierarchical clustering code, to implement
# merging between two clusters, defined as a dict containing node number
# as keys and edge weights as values.
def max_merge(
IntFloatDict a,
IntFloatDict b,
const intp_t[:] mask,
intp_t n_a,
intp_t n_b
):
"""Merge two IntFloatDicts with the max strategy: when the same key is
present in the two dicts, the max of the two values is used.
Parameters
----------
a, b : IntFloatDict object
The IntFloatDicts to merge
mask : ndarray array of dtype integer and of dimension 1
a mask for keys to ignore: if not mask[key] the corresponding key
is skipped in the output dictionary
n_a, n_b : int
n_a and n_b are weights for a and b for the merge strategy.
They are not used in the case of a max merge.
Returns
-------
out : IntFloatDict object
The IntFloatDict resulting from the merge
"""
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
cdef intp_t key
cdef float64_t value
# First copy a into out
while a_it != a_end:
key = deref(a_it).first
if mask[key]:
out_obj.my_map[key] = deref(a_it).second
inc(a_it)
# Then merge b into out
# `out_end` is read once here; insertions into a std::map do not invalidate
# its iterators, so it stays valid while keys are added below.
cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
while b_it != b_end:
key = deref(b_it).first
value = deref(b_it).second
if mask[key]:
out_it = out_obj.my_map.find(key)
if out_it == out_end:
# Key not found
out_obj.my_map[key] = value
else:
# Key present in both dicts: keep the larger value.
deref(out_it).second = fmax(deref(out_it).second, value)
inc(b_it)
return out_obj
def average_merge(
IntFloatDict a,
IntFloatDict b,
const intp_t[:] mask,
intp_t n_a,
intp_t n_b
):
"""Merge two IntFloatDicts with the average strategy: when the
same key is present in the two dicts, the weighted average of the two
values is used.
Parameters
----------
a, b : IntFloatDict object
The IntFloatDicts to merge
mask : ndarray array of dtype integer and of dimension 1
a mask for keys to ignore: if not mask[key] the corresponding key
is skipped in the output dictionary
n_a, n_b : int
n_a and n_b are weights for a and b for the merge strategy.
They are used for a weighted mean.
Returns
-------
out : IntFloatDict object
The IntFloatDict resulting from the merge
"""
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
cdef intp_t key
cdef float64_t value
# Total weight, used as the denominator of the weighted mean.
cdef float64_t n_out = <float64_t> (n_a + n_b)
# First copy a into out
while a_it != a_end:
key = deref(a_it).first
if mask[key]:
out_obj.my_map[key] = deref(a_it).second
inc(a_it)
# Then merge b into out
# `out_end` is read once here; insertions into a std::map do not invalidate
# its iterators, so it stays valid while keys are added below.
cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
while b_it != b_end:
key = deref(b_it).first
value = deref(b_it).second
if mask[key]:
out_it = out_obj.my_map.find(key)
if out_it == out_end:
# Key not found
out_obj.my_map[key] = value
else:
# Key present in both dicts: weighted mean of the two values.
deref(out_it).second = (n_a * deref(out_it).second
+ n_b * value) / n_out
inc(b_it)
return out_obj
###############################################################################
# An edge object for fast comparisons
cdef class WeightedEdge:
    """Edge between nodes `a` and `b`; instances compare by `weight` only."""
    cdef public intp_t a
    cdef public intp_t b
    cdef public float64_t weight

    def __init__(self, float64_t weight, intp_t a, intp_t b):
        self.a = a
        self.b = b
        self.weight = weight

    def __richcmp__(self, WeightedEdge other, int op):
        """Compare the weights of two edges.

        `op` is the rich comparison code received by the method::

            0: <    1: <=    2: ==    3: !=    4: >    5: >=

        Any other code yields None.
        """
        if op == 0:
            return self.weight < other.weight
        if op == 1:
            return self.weight <= other.weight
        if op == 2:
            return self.weight == other.weight
        if op == 3:
            return self.weight != other.weight
        if op == 4:
            return self.weight > other.weight
        if op == 5:
            return self.weight >= other.weight

    def __repr__(self):
        cls_name = self.__class__.__name__
        return "%s(weight=%f, a=%i, b=%i)" % (
            cls_name, self.weight, self.a, self.b
        )
################################################################################
# Efficient labelling/conversion of MSTs to single linkage hierarchies
# Union-find structure used to label clusters while merging: the N initial
# nodes are labelled 0..N-1 and each call to `union` creates the next label,
# up to 2 * N - 1 labels in total.
cdef class UnionFind(object):
def __init__(self, N):
# parent[i] is the label node i was merged into, or -1 when not merged yet.
self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C')
# Labels below N are the initial nodes; merged clusters start at N.
self.next_label = N
# Initial nodes have size 1; sizes of merged clusters are filled by `union`.
self.size = np.hstack((np.ones(N, dtype=np.intp),
np.zeros(N - 1, dtype=np.intp)))
# Merge m and n under a newly created label whose size is the sum of both.
cdef void union(self, intp_t m, intp_t n) noexcept:
self.parent[m] = self.next_label
self.parent[n] = self.next_label
self.size[self.next_label] = self.size[m] + self.size[n]
self.next_label += 1
return
# Return the top-most label reachable from n by following parent links.
@cython.wraparound(True)
cdef intp_t fast_find(self, intp_t n) noexcept:
cdef intp_t p
p = n
# find the highest node in the linkage graph so far
while self.parent[n] != -1:
n = self.parent[n]
# provide a shortcut up to the highest node
# NOTE(review): in the tuple assignment below the targets are assigned left
# to right, so `p` is advanced before `self.parent[p]` is written — confirm
# that this is the intended form of path shortcutting.
while self.parent[p] != n:
p, self.parent[p] = self.parent[p], n
return n
def _single_linkage_label(const float64_t[:, :] L):
    """
    Convert a linkage array or MST to a tree by labelling clusters at merges.

    A union-find structure keeps track of the merges. This is the private
    version of the function: it assumes that ``L`` has already been
    validated. See ``single_linkage_label`` for the user facing version.

    Parameters
    ----------
    L: array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy: one row per merge
    holding the two merged cluster labels, the merge distance and the size
    of the resulting cluster.
    """
    cdef:
        intp_t n_merges = L.shape[0]
        float64_t[:, ::1] result_arr = np.zeros((n_merges, 4), dtype=np.float64)
        UnionFind forest = UnionFind(n_merges + 1)
        intp_t row, root_a, root_b

    for row in range(n_merges):
        # Replace both endpoints by the cluster they currently belong to.
        root_a = forest.fast_find(<intp_t> L[row, 0])
        root_b = forest.fast_find(<intp_t> L[row, 1])

        result_arr[row, 0] = root_a
        result_arr[row, 1] = root_b
        result_arr[row, 2] = L[row, 2]
        result_arr[row, 3] = forest.size[root_a] + forest.size[root_b]

        forest.union(root_a, root_b)

    return np.asarray(result_arr)
@cython.wraparound(True)
def single_linkage_label(L):
    """
    Convert a linkage array or MST to a tree by labelling clusters at merges.

    The input is validated first and then handed to
    ``_single_linkage_label``, which does the actual labelling with a
    union-find structure.

    Parameters
    ----------
    L: array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.

    Raises
    ------
    ValueError
        If a node index is negative or not smaller than
        ``2 * L.shape[0] + 1``, or if the weights are not sorted in
        non-decreasing order.
    """
    # Node indices live in the first two columns.
    if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1:
        raise ValueError("Input MST array is not a validly formatted MST array")

    # Every weight must be <= its successor.
    weights = L[:, 2]
    if not np.all(weights[:-1] <= weights[1:]):
        raise ValueError("Input MST array must be sorted by weight")

    return _single_linkage_label(L)
# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
def mst_linkage_core(
        const float64_t [:, ::1] raw_data,
        DistanceMetric64 dist_metric):
    """
    Compute the necessary elements of a minimum spanning
    tree for computation of single linkage clustering. This
    represents the MST-LINKAGE-CORE algorithm (Figure 6) from
    :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering
    algorithms" <1109.2378>`.

    In contrast to the scipy implementation it never computes
    a full distance matrix, generating distances only as they
    are needed and releasing them when no longer needed.

    Parameters
    ----------
    raw_data: array of shape (n_samples, n_features)
        The array of feature data to be clustered. Must be C-aligned

    dist_metric: DistanceMetric64
        A DistanceMetric64 object conforming to the API from
        ``sklearn.metrics._dist_metrics.pxd`` that will be
        used to compute distances.

    Returns
    -------
    mst_core_data: array of shape (n_samples - 1, 3)
        One row per added edge: the node the edge starts from, the node it
        reaches, and the distance between them. From this one can either
        compute an MST, or the linkage hierarchy very efficiently.
        See :arxiv:`Daniel Mullner, "Modern hierarchical,
        agglomerative clustering algorithms" <1109.2378>` algorithm
        MST-LINKAGE-CORE for more details.
    """
    cdef:
        intp_t n_samples = raw_data.shape[0]
        intp_t num_features = raw_data.shape[1]

        uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool)
        float64_t[:, ::1] result = np.zeros((n_samples - 1, 3))
        # Smallest distance seen so far from the tree to each outside node.
        float64_t[:] current_distances = np.full(n_samples, INFINITY)

        intp_t current_node = 0
        intp_t new_node
        intp_t i
        intp_t j

        float64_t best_known
        float64_t candidate
        float64_t new_distance

    for i in range(n_samples - 1):
        in_tree[current_node] = 1

        new_distance = INFINITY
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            best_known = current_distances[j]
            candidate = dist_metric.dist(&raw_data[current_node, 0],
                                         &raw_data[j, 0],
                                         num_features)

            # Keep the shorter of the stored and the freshly computed distance.
            if candidate < best_known:
                current_distances[j] = candidate
                best_known = candidate

            # Track the outside node that is currently closest to the tree.
            if best_known < new_distance:
                new_distance = best_known
                new_node = j

        result[i, 0] = current_node
        result[i, 1] = new_node
        result[i, 2] = new_distance
        current_node = new_node

    return np.array(result)
@@ -0,0 +1,48 @@
from cython cimport floating
# Euclidean distance between two dense vectors passed as raw pointers.
# Arguments: a, b, n_features, squared (return the squared distance if true).
cdef floating _euclidean_dense_dense(
    const floating*,
    const floating*,
    int,
    bint
) noexcept nogil

# Euclidean distance between a sparse vector and a dense vector.
# Arguments: a_data, a_indices, b, b_squared_norm, squared.
cdef floating _euclidean_sparse_dense(
    const floating[::1],
    const int[::1],
    const floating[::1],
    floating,
    bint
) noexcept nogil

# Relocate centers which have no sample assigned to them (dense input).
# Arguments: X, sample_weight, centers_old, centers_new (updated in place),
# weight_in_clusters (updated in place), labels.
cpdef void _relocate_empty_clusters_dense(
    const floating[:, ::1],
    const floating[::1],
    const floating[:, ::1],
    floating[:, ::1],
    floating[::1],
    const int[::1]
)

# Relocate centers which have no sample assigned to them (sparse input).
# Arguments: X_data, X_indices, X_indptr, sample_weight, centers_old,
# centers_new (updated in place), weight_in_clusters (updated in place),
# labels.
cpdef void _relocate_empty_clusters_sparse(
    const floating[::1],
    const int[::1],
    const int[::1],
    const floating[::1],
    const floating[:, ::1],
    floating[:, ::1],
    floating[::1],
    const int[::1]
)

# Average new centers with respect to the cluster weights.
# Arguments: centers (updated in place), weight_in_clusters.
cdef void _average_centers(
    floating[:, ::1],
    const floating[::1]
)

# Compute the shift between old and new centers.
# Arguments: centers_old, centers_new, center_shift (output).
cdef void _center_shift(
    const floating[:, ::1],
    const floating[:, ::1],
    floating[::1]
)
@@ -0,0 +1,331 @@
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Lars Buitinck
#
# License: BSD 3 clause
import numpy as np
from cython cimport floating
from cython.parallel cimport prange
from libc.math cimport sqrt
from ..utils.extmath import row_norms
# Number of samples per data chunk defined as a global constant.
CHUNK_SIZE = 256
cdef floating _euclidean_dense_dense(
    const floating* a,  # IN
    const floating* b,  # IN
    int n_features,
    bint squared
) noexcept nogil:
    """Euclidean distance between two dense vectors ``a`` and ``b``.

    Both pointers must reference at least ``n_features`` values. The squared
    distance is returned when ``squared`` is true, its square root otherwise.
    """
    cdef:
        int n_blocks = n_features // 4
        int n_tail = n_features % 4
        int block, k
        int base = 0
        floating result = 0

    # The main loop is manually unrolled four features at a time.
    for block in range(n_blocks):
        result += (
            (a[base] - b[base]) * (a[base] - b[base]) +
            (a[base + 1] - b[base + 1]) * (a[base + 1] - b[base + 1]) +
            (a[base + 2] - b[base + 2]) * (a[base + 2] - b[base + 2]) +
            (a[base + 3] - b[base + 3]) * (a[base + 3] - b[base + 3])
        )
        base += 4

    # Remaining (fewer than four) features.
    for k in range(n_tail):
        result += (a[base + k] - b[base + k]) * (a[base + k] - b[base + k])

    if squared:
        return result
    return sqrt(result)
def _euclidean_dense_dense_wrapper(
    const floating[::1] a,
    const floating[::1] b,
    bint squared
):
    """Python-callable entry point to ``_euclidean_dense_dense``, for tests.

    The number of features is taken from the length of ``a``.
    """
    cdef int n_features = a.shape[0]
    return _euclidean_dense_dense(&a[0], &b[0], n_features, squared)
cdef floating _euclidean_sparse_dense(
    const floating[::1] a_data,  # IN
    const int[::1] a_indices,    # IN
    const floating[::1] b,       # IN
    floating b_squared_norm,
    bint squared
) noexcept nogil:
    """Euclidean distance between a sparse vector ``a`` and a dense vector ``b``.

    ``a`` is given by its stored values ``a_data`` and their positions
    ``a_indices``; ``b_squared_norm`` is the caller-supplied squared norm of
    ``b``. Only the stored entries of ``a`` are visited: each one contributes
    ``(a_i - b_i)^2 - b_i^2`` and ``b_squared_norm`` is added at the end.
    """
    cdef:
        int nnz = a_indices.shape[0]
        int pos
        floating diff, b_val
        floating acc = 0.0

    for pos in range(nnz):
        b_val = b[a_indices[pos]]
        diff = a_data[pos] - b_val
        acc += diff * diff - b_val * b_val

    acc += b_squared_norm

    # Negative totals are clipped to zero.
    if acc < 0:
        acc = 0.0

    if squared:
        return acc
    return sqrt(acc)
def _euclidean_sparse_dense_wrapper(
    const floating[::1] a_data,
    const int[::1] a_indices,
    const floating[::1] b,
    floating b_squared_norm,
    bint squared
):
    """Python-callable entry point to ``_euclidean_sparse_dense``, for tests."""
    return _euclidean_sparse_dense(
        a_data,
        a_indices,
        b,
        b_squared_norm,
        squared,
    )
cpdef floating _inertia_dense(
        const floating[:, ::1] X,           # IN
        const floating[::1] sample_weight,  # IN
        const floating[:, ::1] centers,     # IN
        const int[::1] labels,              # IN
        int n_threads,
        int single_label=-1,
):
    """Compute inertia for dense input data.

    The inertia is the weighted sum of squared distances between each sample
    and the center it is assigned to. When ``single_label`` is >= 0 only the
    samples carrying that label contribute.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int sample_idx, cluster_idx

        floating sq_dist = 0.0
        floating inertia = 0.0

    # ``inertia`` is only updated in place, which makes it a prange reduction.
    for sample_idx in prange(n_samples, nogil=True, num_threads=n_threads,
                             schedule='static'):
        cluster_idx = labels[sample_idx]
        if single_label < 0 or single_label == cluster_idx:
            sq_dist = _euclidean_dense_dense(
                &X[sample_idx, 0], &centers[cluster_idx, 0], n_features, True)
            inertia += sq_dist * sample_weight[sample_idx]

    return inertia
cpdef floating _inertia_sparse(
        X,                                  # IN
        const floating[::1] sample_weight,  # IN
        const floating[:, ::1] centers,     # IN
        const int[::1] labels,              # IN
        int n_threads,
        int single_label=-1,
):
    """Compute inertia for sparse input data.

    The inertia is the weighted sum of squared distances between each sample
    and the center it is assigned to. When ``single_label`` is >= 0 only the
    samples carrying that label contribute. ``X`` must expose ``data``,
    ``indices``, ``indptr`` and ``shape``.
    """
    cdef:
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        int n_samples = X.shape[0]
        int sample_idx, cluster_idx
        int row_start, row_end

        floating sq_dist = 0.0
        floating inertia = 0.0

        floating[::1] centers_squared_norms = row_norms(centers, squared=True)

    # ``inertia`` is only updated in place, which makes it a prange reduction.
    for sample_idx in prange(n_samples, nogil=True, num_threads=n_threads,
                             schedule='static'):
        cluster_idx = labels[sample_idx]
        if single_label < 0 or single_label == cluster_idx:
            # Stored entries of row ``sample_idx``.
            row_start = X_indptr[sample_idx]
            row_end = X_indptr[sample_idx + 1]
            sq_dist = _euclidean_sparse_dense(
                X_data[row_start: row_end],
                X_indices[row_start: row_end],
                centers[cluster_idx], centers_squared_norms[cluster_idx], True)
            inertia += sq_dist * sample_weight[sample_idx]

    return inertia
cpdef void _relocate_empty_clusters_dense(
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # INOUT
        floating[::1] weight_in_clusters,    # INOUT
        const int[::1] labels                # IN
):
    """Relocate centers which have no sample assigned to them.

    Every cluster whose entry in ``weight_in_clusters`` is zero is moved onto
    one of the samples lying farthest from its own (old) center. The weighted
    sample is removed from the cluster it came from and becomes the sole
    content of the previously empty cluster; ``centers_new`` and
    ``weight_in_clusters`` are updated in place.
    """
    cdef:
        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
        int n_empty = empty_clusters.shape[0]

    if n_empty == 0:
        return

    cdef:
        int n_features = X.shape[1]

        # Squared distance of each sample to the old center of its cluster.
        floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
        # Indices of the ``n_empty`` farthest samples, read back to front.
        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)

        int new_cluster_id, old_cluster_id, far_idx, slot, feat
        floating weight, contribution

    if np.max(distances) == 0:
        # Happens when there are more clusters than non-duplicate samples.
        # Relocating is pointless in this case.
        return

    for slot in range(n_empty):
        new_cluster_id = empty_clusters[slot]
        far_idx = far_from_centers[slot]

        weight = sample_weight[far_idx]
        old_cluster_id = labels[far_idx]

        for feat in range(n_features):
            contribution = X[far_idx, feat] * weight
            centers_new[old_cluster_id, feat] -= contribution
            centers_new[new_cluster_id, feat] = contribution

        weight_in_clusters[new_cluster_id] = weight
        weight_in_clusters[old_cluster_id] -= weight
cpdef void _relocate_empty_clusters_sparse(
        const floating[::1] X_data,          # IN
        const int[::1] X_indices,            # IN
        const int[::1] X_indptr,             # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # INOUT
        floating[::1] weight_in_clusters,    # INOUT
        const int[::1] labels                # IN
):
    """Relocate centers which have no sample assigned to them.

    Sparse counterpart of ``_relocate_empty_clusters_dense``: the samples are
    described by the ``X_data`` / ``X_indices`` / ``X_indptr`` triplet. Every
    cluster whose entry in ``weight_in_clusters`` is zero is moved onto one
    of the samples lying farthest from its own (old) center; ``centers_new``
    and ``weight_in_clusters`` are updated in place.
    """
    cdef:
        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
        int n_empty = empty_clusters.shape[0]

    if n_empty == 0:
        return

    cdef:
        int n_samples = X_indptr.shape[0] - 1
        int sample, assigned, nz
        int row_start, row_end

        floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)

    # Squared distance of each sample to the old center of its cluster.
    for sample in range(n_samples):
        assigned = labels[sample]
        row_start = X_indptr[sample]
        row_end = X_indptr[sample + 1]
        distances[sample] = _euclidean_sparse_dense(
            X_data[row_start: row_end],
            X_indices[row_start: row_end],
            centers_old[assigned], centers_squared_norms[assigned], True)

    if np.max(distances) == 0:
        # Happens when there are more clusters than non-duplicate samples.
        # Relocating is pointless in this case.
        return

    cdef:
        # Indices of the ``n_empty`` farthest samples, read back to front.
        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)

        int new_cluster_id, old_cluster_id, far_idx, slot
        floating weight, contribution

    for slot in range(n_empty):
        new_cluster_id = empty_clusters[slot]
        far_idx = far_from_centers[slot]

        weight = sample_weight[far_idx]
        old_cluster_id = labels[far_idx]

        # Only the stored entries of the relocated sample are touched.
        for nz in range(X_indptr[far_idx], X_indptr[far_idx + 1]):
            contribution = X_data[nz] * weight
            centers_new[old_cluster_id, X_indices[nz]] -= contribution
            centers_new[new_cluster_id, X_indices[nz]] = contribution

        weight_in_clusters[new_cluster_id] = weight
        weight_in_clusters[old_cluster_id] -= weight
cdef void _average_centers(
        floating[:, ::1] centers,               # INOUT
        const floating[::1] weight_in_clusters  # IN
):
    """Average new centers wrt weights.

    Each row of ``centers`` whose weight is positive is multiplied in place
    by the inverse of that weight. Every other row is overwritten with the
    row of the cluster that has the largest weight.
    """
    cdef:
        int n_clusters = centers.shape[0]
        int n_features = centers.shape[1]
        int cluster, feat
        floating inv_weight
        int heaviest = np.argmax(weight_in_clusters)

    for cluster in range(n_clusters):
        if weight_in_clusters[cluster] > 0:
            inv_weight = 1.0 / weight_in_clusters[cluster]
            for feat in range(n_features):
                centers[cluster, feat] *= inv_weight
        else:
            # For convenience, empty clusters are not left at the origin but
            # placed at the location of the biggest cluster.
            for feat in range(n_features):
                centers[cluster, feat] = centers[heaviest, feat]
cdef void _center_shift(
        const floating[:, ::1] centers_old,  # IN
        const floating[:, ::1] centers_new,  # IN
        floating[::1] center_shift           # OUT
):
    """Compute shift between old and new centers.

    ``center_shift[j]`` receives the (non-squared) Euclidean distance between
    row ``j`` of ``centers_new`` and row ``j`` of ``centers_old``.
    """
    cdef:
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]
        int cluster

    for cluster in range(n_clusters):
        center_shift[cluster] = _euclidean_dense_dense(
            &centers_new[cluster, 0],
            &centers_old[cluster, 0],
            n_features,
            False,
        )
def _is_same_clustering(
    const int[::1] labels1,
    const int[::1] labels2,
    n_clusters
):
    """Check if two arrays of labels are the same up to a permutation of the labels.

    The labels of ``labels1`` are mapped onto those of ``labels2`` in order of
    first appearance; ``False`` is returned as soon as a label of ``labels1``
    would have to map onto two different labels of ``labels2``.
    """
    cdef:
        # mapping[l] is the label of labels2 paired with label l of labels1,
        # or -1 while l has not been seen yet.
        int[::1] mapping = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
        int i, src

    for i in range(labels1.shape[0]):
        src = labels1[i]
        if mapping[src] == -1:
            mapping[src] = labels2[i]
        elif mapping[src] != labels2[i]:
            return False
    return True
@@ -0,0 +1,687 @@
# Author: Andreas Mueller
#
# Licence: BSD 3 clause
from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport calloc, free
from libc.string cimport memset
from ..utils._openmp_helpers cimport omp_lock_t
from ..utils._openmp_helpers cimport omp_init_lock
from ..utils._openmp_helpers cimport omp_destroy_lock
from ..utils._openmp_helpers cimport omp_set_lock
from ..utils._openmp_helpers cimport omp_unset_lock
from ..utils.extmath import row_norms
from ._k_means_common import CHUNK_SIZE
from ._k_means_common cimport _relocate_empty_clusters_dense
from ._k_means_common cimport _relocate_empty_clusters_sparse
from ._k_means_common cimport _euclidean_dense_dense
from ._k_means_common cimport _euclidean_sparse_dense
from ._k_means_common cimport _average_centers
from ._k_means_common cimport _center_shift
def init_bounds_dense(
        const floating[:, ::1] X,                      # IN
        const floating[:, ::1] centers,                # IN
        const floating[:, ::1] center_half_distances,  # IN
        int[::1] labels,                               # OUT
        floating[::1] upper_bounds,                    # OUT
        floating[:, ::1] lower_bounds,                 # OUT
        int n_threads):
    """Initialize upper and lower bounds for each sample for dense input data.

    Each sample starts out assigned to center 0. Centers are then visited in
    order, and the distance to center ``j`` is only computed when the current
    best distance exceeds ``center_half_distances[best, j]``; otherwise
    center ``j`` is skipped and ``lower_bounds[i, j]`` is left untouched.
    Whenever a distance is computed it is stored in ``lower_bounds[i, j]``,
    and the closest center found so far is updated.

    On return ``labels[i]`` holds the selected center and ``upper_bounds[i]``
    the distance to it.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The input data.

    centers : ndarray of shape (n_clusters, n_features), dtype=floating
        The cluster centers.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), dtype=floating
        The half of the distance between any 2 clusters centers.

    labels : ndarray of shape (n_samples,), dtype=int
        The label for each sample. This array is modified in place.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        The upper bound on the distance between each sample and its closest
        cluster center. This array is modified in place.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        The lower bound on the distance between each sample and each cluster
        center. This array is modified in place.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers.shape[0]
        int n_features = X.shape[1]

        floating closest_dist, candidate_dist
        int closest, sample, cluster

    for sample in prange(
        n_samples, num_threads=n_threads, schedule='static', nogil=True
    ):
        closest = 0
        closest_dist = _euclidean_dense_dense(
            &X[sample, 0], &centers[0, 0], n_features, False)
        lower_bounds[sample, 0] = closest_dist

        for cluster in range(1, n_clusters):
            # Centers at least twice as far from the current best as the
            # sample is are skipped.
            if closest_dist > center_half_distances[closest, cluster]:
                candidate_dist = _euclidean_dense_dense(
                    &X[sample, 0], &centers[cluster, 0], n_features, False)
                lower_bounds[sample, cluster] = candidate_dist
                if candidate_dist < closest_dist:
                    closest_dist = candidate_dist
                    closest = cluster

        labels[sample] = closest
        upper_bounds[sample] = closest_dist
def init_bounds_sparse(
        X,                                             # IN
        const floating[:, ::1] centers,                # IN
        const floating[:, ::1] center_half_distances,  # IN
        int[::1] labels,                               # OUT
        floating[::1] upper_bounds,                    # OUT
        floating[:, ::1] lower_bounds,                 # OUT
        int n_threads):
    """Initialize upper and lower bounds for each sample for sparse input data.

    Each sample starts out assigned to center 0. Centers are then visited in
    order, and the distance to center ``j`` is only computed when the current
    best distance exceeds ``center_half_distances[best, j]``; otherwise
    center ``j`` is skipped and ``lower_bounds[i, j]`` is left untouched.
    Whenever a distance is computed it is stored in ``lower_bounds[i, j]``,
    and the closest center found so far is updated.

    On return ``labels[i]`` holds the selected center and ``upper_bounds[i]``
    the distance to it.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features), dtype=floating
        The input data. Must be in CSR format.

    centers : ndarray of shape (n_clusters, n_features), dtype=floating
        The cluster centers.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), dtype=floating
        The half of the distance between any 2 clusters centers.

    labels : ndarray of shape (n_samples,), dtype=int
        The label for each sample. This array is modified in place.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        The upper bound on the distance between each sample and its closest
        cluster center. This array is modified in place.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        The lower bound on the distance between each sample and each cluster
        center. This array is modified in place.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers.shape[0]

        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        floating closest_dist, candidate_dist
        int closest, sample, cluster
        int row_start, row_end

        floating[::1] centers_squared_norms = row_norms(centers, squared=True)

    for sample in prange(
        n_samples, num_threads=n_threads, schedule='static', nogil=True
    ):
        # Stored entries of row ``sample``.
        row_start = X_indptr[sample]
        row_end = X_indptr[sample + 1]

        closest = 0
        closest_dist = _euclidean_sparse_dense(
            X_data[row_start: row_end],
            X_indices[row_start: row_end],
            centers[0], centers_squared_norms[0], False)
        lower_bounds[sample, 0] = closest_dist

        for cluster in range(1, n_clusters):
            # Centers at least twice as far from the current best as the
            # sample is are skipped.
            if closest_dist > center_half_distances[closest, cluster]:
                candidate_dist = _euclidean_sparse_dense(
                    X_data[row_start: row_end],
                    X_indices[row_start: row_end],
                    centers[cluster], centers_squared_norms[cluster], False)
                lower_bounds[sample, cluster] = candidate_dist
                if candidate_dist < closest_dist:
                    closest_dist = candidate_dist
                    closest = cluster

        labels[sample] = closest
        upper_bounds[sample] = closest_dist
def elkan_iter_chunked_dense(
        const floating[:, ::1] X,                      # IN
        const floating[::1] sample_weight,             # IN
        const floating[:, ::1] centers_old,            # IN
        floating[:, ::1] centers_new,                  # OUT
        floating[::1] weight_in_clusters,              # OUT
        const floating[:, ::1] center_half_distances,  # IN
        const floating[::1] distance_next_center,      # IN
        floating[::1] upper_bounds,                    # INOUT
        floating[:, ::1] lower_bounds,                 # INOUT
        int[::1] labels,                               # INOUT
        floating[::1] center_shift,                    # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means Elkan algorithm with dense input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    The samples are split into chunks of at most ``CHUNK_SIZE`` rows which
    are processed in parallel by ``_update_chunk_dense``. Each thread
    accumulates into its own buffers, which are then summed into
    ``centers_new`` and ``weight_in_clusters`` under a lock. When
    ``update_centers`` is true, empty clusters are relocated, the centers are
    averaged, ``center_shift`` is computed and the bounds are shifted
    accordingly.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The observations to cluster.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers the samples are compared against during this iteration.
        Only read.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Placeholder for the new centers computed during this iteration.
        Reset to zero and filled in only when ``update_centers`` is true.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center.

    center_half_distances : ndarray of shape (n_clusters, n_clusters), dtype=floating
        Half pairwise distances between centers.

    distance_next_center : ndarray of shape (n_clusters,), dtype=floating
        Distance between each center and its closest center.

    upper_bounds : ndarray of shape (n_samples,), dtype=floating
        Upper bound for the distance between each sample and its center,
        updated inplace.

    lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
        Lower bound for the distance between each sample and each center,
        updated inplace.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_new.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        # hard-coded number of samples per chunk. Splitting in chunks is
        # necessary to get parallelism. Chunk size chosen to be same as lloyd's
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start, end

        int i, j, k

        # Per-thread accumulation buffers, allocated inside the parallel block.
        floating *centers_new_chunk
        floating *weight_in_clusters_chunk

        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        # Start the accumulation from zero; the lock guards the final
        # reduction of the thread-local buffers.
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        # NOTE(review): the calloc results are not checked for NULL.
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            # Only the last chunk can be shorter than n_samples_chunk.
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            _update_chunk_dense(
                X[start: end],
                sample_weight[start: end],
                centers_old,
                center_half_distances,
                distance_next_center,
                labels[start: end],
                upper_bounds[start: end],
                lower_bounds[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
            omp_unset_lock(&lock)

        free(centers_new_chunk)
        free(weight_in_clusters_chunk)

    if update_centers:
        omp_destroy_lock(&lock)
        _relocate_empty_clusters_dense(X, sample_weight, centers_old,
                                       centers_new, weight_in_clusters, labels)

        _average_centers(centers_new, weight_in_clusters)
        _center_shift(centers_old, centers_new, center_shift)

        # update lower and upper bounds
        # Upper bounds grow by the shift of the assigned center; lower bounds
        # shrink by the shift of each center and are clipped at zero.
        for i in range(n_samples):
            upper_bounds[i] += center_shift[labels[i]]

            for j in range(n_clusters):
                lower_bounds[i, j] -= center_shift[j]
                if lower_bounds[i, j] < 0:
                    lower_bounds[i, j] = 0
cdef void _update_chunk_dense(
        const floating[:, ::1] X,                      # IN
        const floating[::1] sample_weight,             # IN
        const floating[:, ::1] centers_old,            # IN
        const floating[:, ::1] center_half_distances,  # IN
        const floating[::1] distance_next_center,      # IN
        int[::1] labels,                               # INOUT
        floating[::1] upper_bounds,                    # INOUT
        floating[:, ::1] lower_bounds,                 # INOUT
        floating *centers_new,                         # OUT
        floating *weight_in_clusters,                  # OUT
        bint update_centers) noexcept nogil:
    """K-means combined EM step for one dense data chunk.

    Compute the partial contribution of a single data chunk to the labels and
    centers.

    ``labels``, ``upper_bounds`` and ``lower_bounds`` are the slices matching
    the rows of ``X`` and are updated in place. ``centers_new`` (a flat
    buffer indexed as ``label * n_features + k``) and ``weight_in_clusters``
    are only accumulated into, and only when ``update_centers`` is true.
    """
    cdef:
        int n_samples = labels.shape[0]
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]

        floating upper_bound, distance
        int i, j, k, label

    for i in range(n_samples):
        upper_bound = upper_bounds[i]
        # Becomes 1 once upper_bound has been replaced by the exact distance
        # to the currently assigned center.
        bounds_tight = 0
        label = labels[i]

        # Next center is not far away from the currently assigned center.
        # Sample might need to be assigned to another center.
        if not distance_next_center[label] >= upper_bound:

            for j in range(n_clusters):

                # If this holds, then center j is a good candidate for the
                # sample to be relabelled, and we need to confirm this by
                # recomputing the upper and lower bounds.
                if (
                    j != label
                    and (upper_bound > lower_bounds[i, j])
                    and (upper_bound > center_half_distances[label, j])
                ):

                    # Recompute upper bound by calculating the actual distance
                    # between the sample and its current assigned center.
                    if not bounds_tight:
                        upper_bound = _euclidean_dense_dense(
                            &X[i, 0], &centers_old[label, 0], n_features, False)
                        lower_bounds[i, label] = upper_bound
                        bounds_tight = 1

                    # If the condition still holds, then compute the actual
                    # distance between the sample and center. If this is less
                    # than the previous distance, reassign label.
                    # NOTE(review): this re-check joins the two tests with
                    # ``or`` while the first check uses ``and``.
                    if (
                        upper_bound > lower_bounds[i, j]
                        or (upper_bound > center_half_distances[label, j])
                    ):

                        distance = _euclidean_dense_dense(
                            &X[i, 0], &centers_old[j, 0], n_features, False)
                        lower_bounds[i, j] = distance
                        if distance < upper_bound:
                            label = j
                            upper_bound = distance

            labels[i] = label
            upper_bounds[i] = upper_bound

        if update_centers:
            # Accumulate the weighted sample into its (possibly new) cluster.
            weight_in_clusters[label] += sample_weight[i]
            for k in range(n_features):
                centers_new[label * n_features + k] += X[i, k] * sample_weight[i]
def elkan_iter_chunked_sparse(
X, # IN
const floating[::1] sample_weight, # IN
const floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
const floating[:, ::1] center_half_distances, # IN
const floating[::1] distance_next_center, # IN
floating[::1] upper_bounds, # INOUT
floating[:, ::1] lower_bounds, # INOUT
int[::1] labels, # INOUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means Elkan algorithm with sparse input.
Update labels and centers (inplace), for one iteration, distributed
over data chunks.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
The observations to cluster. Must be in CSR format.
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
Placeholder for the sums of the weights of every observation assigned
to each center.
center_half_distances : ndarray of shape (n_clusters, n_clusters), \
dtype=floating
Half pairwise distances between centers.
distance_next_center : ndarray of shape (n_clusters,), dtype=floating
Distance between each center its closest center.
upper_bounds : ndarray of shape (n_samples,), dtype=floating
Upper bound for the distance between each sample and its center,
updated inplace.
lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating
Lower bound for the distance between each sample and each center,
updated inplace.
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
center_shift : ndarray of shape (n_clusters,), dtype=floating
Distance between old and new centers.
n_threads : int
The number of threads to be used by openmp.
update_centers : bool
- If True, the labels and the new centers will be computed, i.e. runs
the E-step and the M-step of the algorithm.
- If False, only the labels will be computed, i.e runs the E-step of
the algorithm. This is useful especially when calling predict on a
fitted model.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int n_clusters = centers_new.shape[0]
if n_samples == 0:
# An empty array was passed, do nothing and return early (before
# attempting to compute n_chunks). This can typically happen when
# calling the prediction function of a bisecting k-means model with a
# large fraction of outiers.
return
cdef:
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
# hard-coded number of samples per chunk. Splitting in chunks is
# necessary to get parallelism. Chunk size chosen to be same as lloyd's
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx
int start, end
int i, j, k
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
floating *centers_new_chunk
floating *weight_in_clusters_chunk
omp_lock_t lock
# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
# number of threads should not be bigger than number of chunks
n_threads = min(n_threads, n_chunks)
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
omp_init_lock(&lock)
with nogil, parallel(num_threads=n_threads):
# thread local buffers
centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
for chunk_idx in prange(n_chunks, schedule='static'):
start = chunk_idx * n_samples_chunk
if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
end = start + n_samples_rem
else:
end = start + n_samples_chunk
_update_chunk_sparse(
X_data[X_indptr[start]: X_indptr[end]],
X_indices[X_indptr[start]: X_indptr[end]],
X_indptr[start: end+1],
sample_weight[start: end],
centers_old,
centers_squared_norms,
center_half_distances,
distance_next_center,
labels[start: end],
upper_bounds[start: end],
lower_bounds[start: end],
centers_new_chunk,
weight_in_clusters_chunk,
update_centers)
# reduction from local buffers.
if update_centers:
# The lock is necessary to avoid race conditions when aggregating
# info from different thread-local buffers.
omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
omp_unset_lock(&lock)
free(centers_new_chunk)
free(weight_in_clusters_chunk)
if update_centers:
omp_destroy_lock(&lock)
_relocate_empty_clusters_sparse(
X_data, X_indices, X_indptr, sample_weight,
centers_old, centers_new, weight_in_clusters, labels)
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
# update lower and upper bounds
for i in range(n_samples):
upper_bounds[i] += center_shift[labels[i]]
for j in range(n_clusters):
lower_bounds[i, j] -= center_shift[j]
if lower_bounds[i, j] < 0:
lower_bounds[i, j] = 0
cdef void _update_chunk_sparse(
        const floating[::1] X_data, # IN
        const int[::1] X_indices, # IN
        const int[::1] X_indptr, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        const floating[::1] centers_squared_norms, # IN
        const floating[:, ::1] center_half_distances, # IN
        const floating[::1] distance_next_center, # IN
        int[::1] labels, # INOUT
        floating[::1] upper_bounds, # INOUT
        floating[:, ::1] lower_bounds, # INOUT
        floating *centers_new, # OUT
        floating *weight_in_clusters, # OUT
        bint update_centers) noexcept nogil:
    """K-means combined EM step for one sparse data chunk.

    Compute the partial contribution of a single data chunk to the labels and
    centers.

    For every sample of the chunk, the upper bound (distance to the currently
    assigned center) and the lower bounds (distance to each center) are used
    to skip distance computations. `labels`, `upper_bounds` and
    `lower_bounds` are updated in place.

    When `update_centers` is true, the weighted samples and their weights are
    accumulated per cluster into the raw buffers `centers_new` (flat,
    row-major, indexed as `label * n_features + feature`) and
    `weight_in_clusters`. Nothing is reset here: the buffers are only
    incremented.
    """
    cdef:
        int n_samples = labels.shape[0]
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]
        floating upper_bound, distance
        int i, j, k, label
        # X_indptr still holds offsets relative to the full matrix whereas
        # X_data and X_indices are chunk-local slices: subtracting the first
        # offset re-bases every row pointer onto the slices.
        int s = X_indptr[0]

    for i in range(n_samples):
        upper_bound = upper_bounds[i]
        # Becomes 1 once upper_bound holds an exactly recomputed distance
        # to the assigned center for this sample.
        bounds_tight = 0
        label = labels[i]
        # Next center is not far away from the currently assigned center.
        # Sample might need to be assigned to another center.
        if not distance_next_center[label] >= upper_bound:
            for j in range(n_clusters):
                # If this holds, then center j is a good candidate for the
                # sample to be relabelled, and we need to confirm this by
                # recomputing the upper and lower bounds.
                if (
                    j != label
                    and (upper_bound > lower_bounds[i, j])
                    and (upper_bound > center_half_distances[label, j])
                ):
                    # Recompute upper bound by calculating the actual distance
                    # between the sample and its current assigned center.
                    # NOTE(review): the trailing `False` is presumably the
                    # `squared` flag of the helper; confirm against its
                    # definition.
                    if not bounds_tight:
                        upper_bound = _euclidean_sparse_dense(
                            X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
                            X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
                            centers_old[label], centers_squared_norms[label], False)
                        lower_bounds[i, label] = upper_bound
                        bounds_tight = 1
                    # If the condition still holds, then compute the actual
                    # distance between the sample and center. If this is less
                    # than the previous distance, reassign label.
                    # Note the `or`: here a single satisfied test is enough,
                    # unlike the candidate filter above which requires both.
                    if (
                        upper_bound > lower_bounds[i, j]
                        or (upper_bound > center_half_distances[label, j])
                    ):
                        distance = _euclidean_sparse_dense(
                            X_data[X_indptr[i] - s: X_indptr[i + 1] - s],
                            X_indices[X_indptr[i] - s: X_indptr[i + 1] - s],
                            centers_old[j], centers_squared_norms[j], False)
                        lower_bounds[i, j] = distance
                        if distance < upper_bound:
                            label = j
                            upper_bound = distance
            labels[i] = label
            upper_bounds[i] = upper_bound
        if update_centers:
            weight_in_clusters[label] += sample_weight[i]
            # Only the stored (non-zero) entries of the row contribute.
            for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
                centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
@@ -0,0 +1,420 @@
# Licence: BSD 3 clause
from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport malloc, calloc, free
from libc.string cimport memset
from libc.float cimport DBL_MAX, FLT_MAX
from ..utils._openmp_helpers cimport omp_lock_t
from ..utils._openmp_helpers cimport omp_init_lock
from ..utils._openmp_helpers cimport omp_destroy_lock
from ..utils._openmp_helpers cimport omp_set_lock
from ..utils._openmp_helpers cimport omp_unset_lock
from ..utils.extmath import row_norms
from ..utils._cython_blas cimport _gemm
from ..utils._cython_blas cimport RowMajor, Trans, NoTrans
from ._k_means_common import CHUNK_SIZE
from ._k_means_common cimport _relocate_empty_clusters_dense
from ._k_means_common cimport _relocate_empty_clusters_sparse
from ._k_means_common cimport _average_centers, _center_shift
def lloyd_iter_chunked_dense(
        const floating[:, ::1] X, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        floating[:, ::1] centers_new, # OUT
        floating[::1] weight_in_clusters, # OUT
        int[::1] labels, # OUT
        floating[::1] center_shift, # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means lloyd algorithm with dense input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The observations to cluster.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers used to assign the labels during this iteration. Read-only.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Placeholder for the new centers computed during this iteration.
        `centers_new` can be `None` if `update_centers` is False.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center. `weight_in_clusters` can be `None` if `update_centers`
        is False.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers. Only written when
        `update_centers` is True.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e. runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_old.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        # hard-coded number of samples per chunk. Appeared to be close to
        # optimal in all situations.
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start, end
        int j, k
        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
        # Per-thread scratch buffers, allocated inside the parallel region.
        floating *centers_new_chunk
        floating *weight_in_clusters_chunk
        floating *pairwise_distances_chunk
        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        # The shared outputs are accumulated into below, so they must start
        # from zero.
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
        pairwise_distances_chunk = <floating*> malloc(n_samples_chunk * n_clusters * sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            # Only the last chunk can be shorter than n_samples_chunk.
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            _update_chunk_dense(
                X[start: end],
                sample_weight[start: end],
                centers_old,
                centers_squared_norms,
                labels[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                pairwise_distances_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
            omp_unset_lock(&lock)

        free(centers_new_chunk)
        free(weight_in_clusters_chunk)
        free(pairwise_distances_chunk)

    if update_centers:
        omp_destroy_lock(&lock)
        # At this point centers_new holds weighted sums; the helpers below
        # post-process them and fill center_shift.
        _relocate_empty_clusters_dense(
            X, sample_weight, centers_old, centers_new, weight_in_clusters, labels
        )
        _average_centers(centers_new, weight_in_clusters)
        _center_shift(centers_old, centers_new, center_shift)
cdef void _update_chunk_dense(
        const floating[:, ::1] X, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        const floating[::1] centers_squared_norms, # IN
        int[::1] labels, # OUT
        floating *centers_new, # OUT
        floating *weight_in_clusters, # OUT
        floating *pairwise_distances, # OUT
        bint update_centers) noexcept nogil:
    """Label one dense chunk and accumulate its contribution to the centers.

    `pairwise_distances` is a scratch buffer with room for
    (chunk size * n_clusters) values. `centers_new` (flat, row-major) and
    `weight_in_clusters` are only incremented, and only when
    `update_centers` is true.
    """
    cdef:
        int n_rows = labels.shape[0]
        int n_centers = centers_old.shape[0]
        int n_dims = centers_old.shape[1]
        int row, col, feat, best
        int dist_offset, center_offset
        floating candidate, best_value, weight

    # The argmin over centers of ||x - c||² = ||x||² - 2 x.c + ||c||² does not
    # depend on ||x||², so only the "- 2 x.c + ||c||²" part is materialised.
    # Step 1: fill every row of the buffer with ||c||².
    for row in range(n_rows):
        dist_offset = row * n_centers
        for col in range(n_centers):
            pairwise_distances[dist_offset + col] = centers_squared_norms[col]

    # Step 2: buffer += -2 * X.dot(C.T)
    _gemm(RowMajor, NoTrans, Trans, n_rows, n_centers, n_dims,
          -2.0, &X[0, 0], n_dims, &centers_old[0, 0], n_dims,
          1.0, pairwise_distances, n_centers)

    # Step 3: per-sample argmin (first minimum wins on ties), then optional
    # accumulation into the partial centers.
    for row in range(n_rows):
        dist_offset = row * n_centers
        best = 0
        best_value = pairwise_distances[dist_offset]
        for col in range(1, n_centers):
            candidate = pairwise_distances[dist_offset + col]
            if candidate < best_value:
                best_value = candidate
                best = col
        labels[row] = best

        if not update_centers:
            continue

        weight = sample_weight[row]
        weight_in_clusters[best] += weight
        center_offset = best * n_dims
        for feat in range(n_dims):
            centers_new[center_offset + feat] += X[row, feat] * weight
def lloyd_iter_chunked_sparse(
        X, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        floating[:, ::1] centers_new, # OUT
        floating[::1] weight_in_clusters, # OUT
        int[::1] labels, # OUT
        floating[::1] center_shift, # OUT
        int n_threads,
        bint update_centers=True):
    """Single iteration of K-means lloyd algorithm with sparse input.

    Update labels and centers (inplace), for one iteration, distributed
    over data chunks.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features), dtype=floating
        The observations to cluster. Must be in CSR format.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers used to assign the labels during this iteration. Read-only.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Placeholder for the new centers computed during this iteration.
        `centers_new` can be `None` if `update_centers` is False.

    weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
        Placeholder for the sums of the weights of every observation assigned
        to each center. `weight_in_clusters` can be `None` if `update_centers`
        is False.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    center_shift : ndarray of shape (n_clusters,), dtype=floating
        Distance between old and new centers. Only written when
        `update_centers` is True.

    n_threads : int
        The number of threads to be used by openmp.

    update_centers : bool
        - If True, the labels and the new centers will be computed, i.e. runs
          the E-step and the M-step of the algorithm.
        - If False, only the labels will be computed, i.e. runs the E-step of
          the algorithm. This is useful especially when calling predict on a
          fitted model.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int n_clusters = centers_old.shape[0]

    if n_samples == 0:
        # An empty array was passed, do nothing and return early (before
        # attempting to compute n_chunks). This can typically happen when
        # calling the prediction function of a bisecting k-means model with a
        # large fraction of outliers.
        return

    cdef:
        # Choose same as for dense. Does not have the same impact since with
        # sparse data the pairwise distances matrix is not precomputed.
        # However, splitting in chunks is necessary to get parallelism.
        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
        int n_chunks = n_samples // n_samples_chunk
        int n_samples_rem = n_samples % n_samples_chunk
        int chunk_idx
        int start = 0, end = 0
        int j, k
        # Raw CSR components of X.
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr
        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
        # Per-thread scratch buffers, allocated inside the parallel region.
        floating *centers_new_chunk
        floating *weight_in_clusters_chunk
        omp_lock_t lock

    # count remainder chunk in total number of chunks
    n_chunks += n_samples != n_chunks * n_samples_chunk

    # number of threads should not be bigger than number of chunks
    n_threads = min(n_threads, n_chunks)

    if update_centers:
        # The shared outputs are accumulated into below, so they must start
        # from zero.
        memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
        memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
        omp_init_lock(&lock)

    with nogil, parallel(num_threads=n_threads):
        # thread local buffers
        centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
        weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))

        for chunk_idx in prange(n_chunks, schedule='static'):
            start = chunk_idx * n_samples_chunk
            # Only the last chunk can be shorter than n_samples_chunk.
            if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
                end = start + n_samples_rem
            else:
                end = start + n_samples_chunk

            # X_data / X_indices are sliced to the rows of the chunk while
            # X_indptr keeps its original offsets (end + 1 entries are needed
            # to delimit the last row).
            _update_chunk_sparse(
                X_data[X_indptr[start]: X_indptr[end]],
                X_indices[X_indptr[start]: X_indptr[end]],
                X_indptr[start: end+1],
                sample_weight[start: end],
                centers_old,
                centers_squared_norms,
                labels[start: end],
                centers_new_chunk,
                weight_in_clusters_chunk,
                update_centers)

        # reduction from local buffers.
        if update_centers:
            # The lock is necessary to avoid race conditions when aggregating
            # info from different thread-local buffers.
            omp_set_lock(&lock)
            for j in range(n_clusters):
                weight_in_clusters[j] += weight_in_clusters_chunk[j]
                for k in range(n_features):
                    centers_new[j, k] += centers_new_chunk[j * n_features + k]
            omp_unset_lock(&lock)

        free(centers_new_chunk)
        free(weight_in_clusters_chunk)

    if update_centers:
        omp_destroy_lock(&lock)
        # At this point centers_new holds weighted sums; the helpers below
        # post-process them and fill center_shift.
        _relocate_empty_clusters_sparse(
            X_data, X_indices, X_indptr, sample_weight,
            centers_old, centers_new, weight_in_clusters, labels)
        _average_centers(centers_new, weight_in_clusters)
        _center_shift(centers_old, centers_new, center_shift)
cdef void _update_chunk_sparse(
        const floating[::1] X_data, # IN
        const int[::1] X_indices, # IN
        const int[::1] X_indptr, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        const floating[::1] centers_squared_norms, # IN
        int[::1] labels, # OUT
        floating *centers_new, # OUT
        floating *weight_in_clusters, # OUT
        bint update_centers) noexcept nogil:
    """Label one sparse (CSR) chunk and accumulate its contribution to the centers.

    `X_data` and `X_indices` are chunk-local while `X_indptr` is re-based with
    its first entry. `centers_new` (flat, row-major) and `weight_in_clusters`
    are only incremented, and only when `update_centers` is true.
    """
    cdef:
        int n_rows = labels.shape[0]
        int n_centers = centers_old.shape[0]
        int n_dims = centers_old.shape[1]
        int base = X_indptr[0]
        int row, center, nz, best
        int row_begin, row_end
        floating dot, score, best_score, largest

    if floating is float:
        largest = FLT_MAX
    else:
        largest = DBL_MAX

    # XXX Precomputing the pairwise distances matrix is not worth it for
    # sparse data currently. Should be tested when BLAS (sparse x dense)
    # matrix multiplication is available.
    for row in range(n_rows):
        row_begin = X_indptr[row] - base
        row_end = X_indptr[row + 1] - base

        best = 0
        best_score = largest
        for center in range(n_centers):
            # x.c restricted to the stored entries of the row
            dot = 0.0
            for nz in range(row_begin, row_end):
                dot += centers_old[center, X_indices[nz]] * X_data[nz]

            # ||x - c||² = ||x||² - 2 x.c + ||c||²: the ||x||² term is the
            # same for every center, so it is left out of the comparison.
            score = centers_squared_norms[center] -2 * dot
            if score < best_score:
                best_score = score
                best = center

        labels[row] = best

        if not update_centers:
            continue

        weight_in_clusters[best] += sample_weight[row]
        for nz in range(row_begin, row_end):
            centers_new[best * n_dims + X_indices[nz]] += X_data[nz] * sample_weight[row]
@@ -0,0 +1,218 @@
from cython cimport floating
from cython.parallel cimport parallel, prange
from libc.stdlib cimport malloc, free
def _minibatch_update_dense(
        const floating[:, ::1] X, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        floating[:, ::1] centers_new, # OUT
        floating[::1] weight_sums, # INOUT
        const int[::1] labels, # IN
        int n_threads):
    """Update of the centers for dense MiniBatchKMeans.

    Each cluster is handled by a single call to `update_center_dense`; the
    clusters are distributed over the threads.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features), dtype=floating
        The observations to cluster.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before this update. Read-only.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Placeholder for the new centers computed during this iteration.

    weight_sums : ndarray of shape (n_clusters,), dtype=floating
        Current sums of the accumulated weights for each center. Updated
        in place.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers_old.shape[0]
        int cluster_idx
        # Per-thread scratch buffer, see below.
        int *indices

    with nogil, parallel(num_threads=n_threads):
        # One buffer per thread, large enough for the case where every
        # sample belongs to the same cluster; reused for each cluster
        # processed by the thread.
        indices = <int*> malloc(n_samples * sizeof(int))

        for cluster_idx in prange(n_clusters, schedule="static"):
            update_center_dense(cluster_idx, X, sample_weight,
                                centers_old, centers_new, weight_sums, labels,
                                indices)

        free(indices)
cdef void update_center_dense(
        int cluster_idx,
        const floating[:, ::1] X, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        floating[:, ::1] centers_new, # OUT
        floating[::1] weight_sums, # INOUT
        const int[::1] labels, # IN
        int *indices) noexcept nogil: # TMP
    """Recompute one center of dense MinibatchKMeans from the current batch.

    `indices` is caller-provided scratch space with room for one entry per
    sample. Row `cluster_idx` of `centers_new` is always written and
    `weight_sums[cluster_idx]` grows by the batch weight of the cluster.
    """
    cdef:
        int n_rows = sample_weight.shape[0]
        int n_dims = centers_old.shape[1]
        int n_members = 0
        int row, col, pos
        floating batch_weight = 0
        floating weight, scale

    # Equivalent of: indices = np.where(labels == cluster_idx)[0]
    for row in range(n_rows):
        if labels[row] == cluster_idx:
            indices[n_members] = row
            batch_weight += sample_weight[row]
            n_members += 1

    if not batch_weight > 0:
        # No weight landed on this cluster in this batch: keep the old center.
        for col in range(n_dims):
            centers_new[cluster_idx, col] = centers_old[cluster_idx, col]
        return

    # Turn the stored mean back into a weighted sum.
    for col in range(n_dims):
        centers_new[cluster_idx, col] = centers_old[cluster_idx, col] * weight_sums[cluster_idx]

    # Add the weighted members of the batch.
    for pos in range(n_members):
        row = indices[pos]
        weight = sample_weight[row]
        for col in range(n_dims):
            centers_new[cluster_idx, col] += X[row, col] * weight

    # Account for the new weight, then divide to get the mean over all the
    # points seen so far (old and new).
    weight_sums[cluster_idx] += batch_weight
    scale = 1 / weight_sums[cluster_idx]
    for col in range(n_dims):
        centers_new[cluster_idx, col] *= scale
def _minibatch_update_sparse(
        X, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        floating[:, ::1] centers_new, # OUT
        floating[::1] weight_sums, # INOUT
        const int[::1] labels, # IN
        int n_threads):
    """Update of the centers for sparse MiniBatchKMeans.

    Each cluster is handled by a single call to `update_center_sparse`; the
    clusters are distributed over the threads.

    Parameters
    ----------
    X : sparse matrix of shape (n_samples, n_features), dtype=floating
        The observations to cluster. Must be in CSR format.

    sample_weight : ndarray of shape (n_samples,), dtype=floating
        The weights for each observation in X.

    centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
        Centers before this update. Read-only.

    centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
        Placeholder for the new centers computed during this iteration.

    weight_sums : ndarray of shape (n_clusters,), dtype=floating
        Current sums of the accumulated weights for each center. Updated
        in place.

    labels : ndarray of shape (n_samples,), dtype=int
        labels assignment.

    n_threads : int
        The number of threads to be used by openmp.
    """
    cdef:
        # Raw CSR components of X.
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr
        int n_samples = X.shape[0]
        int n_clusters = centers_old.shape[0]
        int cluster_idx
        # Per-thread scratch buffer, see below.
        int *indices

    with nogil, parallel(num_threads=n_threads):
        # One buffer per thread, large enough for the case where every
        # sample belongs to the same cluster; reused for each cluster
        # processed by the thread.
        indices = <int*> malloc(n_samples * sizeof(int))

        for cluster_idx in prange(n_clusters, schedule="static"):
            update_center_sparse(cluster_idx, X_data, X_indices, X_indptr,
                                 sample_weight, centers_old, centers_new,
                                 weight_sums, labels, indices)

        free(indices)
cdef void update_center_sparse(
        int cluster_idx,
        const floating[::1] X_data, # IN
        const int[::1] X_indices, # IN
        const int[::1] X_indptr, # IN
        const floating[::1] sample_weight, # IN
        const floating[:, ::1] centers_old, # IN
        floating[:, ::1] centers_new, # OUT
        floating[::1] weight_sums, # INOUT
        const int[::1] labels, # IN
        int *indices) noexcept nogil: # TMP
    """Recompute one center of sparse MinibatchKMeans from the current batch.

    `indices` is caller-provided scratch space with room for one entry per
    sample. Row `cluster_idx` of `centers_new` is always written and
    `weight_sums[cluster_idx]` grows by the batch weight of the cluster.
    """
    cdef:
        int n_rows = sample_weight.shape[0]
        int n_dims = centers_old.shape[1]
        int n_members = 0
        int row, col, pos, nz
        floating batch_weight = 0
        floating weight, scale

    # Equivalent of: indices = np.where(labels == cluster_idx)[0]
    for row in range(n_rows):
        if labels[row] == cluster_idx:
            indices[n_members] = row
            batch_weight += sample_weight[row]
            n_members += 1

    if not batch_weight > 0:
        # No weight landed on this cluster in this batch: keep the old center.
        for col in range(n_dims):
            centers_new[cluster_idx, col] = centers_old[cluster_idx, col]
        return

    # Turn the stored mean back into a weighted sum.
    for col in range(n_dims):
        centers_new[cluster_idx, col] = centers_old[cluster_idx, col] * weight_sums[cluster_idx]

    # Add the weighted members of the batch; only the stored entries of each
    # CSR row contribute.
    for pos in range(n_members):
        row = indices[pos]
        weight = sample_weight[row]
        for nz in range(X_indptr[row], X_indptr[row + 1]):
            centers_new[cluster_idx, X_indices[nz]] += X_data[nz] * weight

    # Account for the new weight, then divide to get the mean over all the
    # points seen so far (old and new).
    weight_sums[cluster_idx] += batch_weight
    scale = 1 / weight_sums[cluster_idx]
    for col in range(n_dims):
        centers_new[cluster_idx, col] *= scale
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,575 @@
"""Mean shift clustering algorithm.
Mean shift clustering aims to discover *blobs* in a smooth density of
samples. It is a centroid based algorithm, which works by updating candidates
for centroids to be the mean of the points within a given region. These
candidates are then filtered in a post-processing stage to eliminate
near-duplicates to form the final set of centroids.
Seeding is performed using a binning technique for scalability.
"""
# Authors: Conrad Lee <conradlee@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Martino Sorbaro <martino.sorbaro@ed.ac.uk>
import warnings
from collections import defaultdict
from numbers import Integral, Real
import numpy as np
from .._config import config_context
from ..base import BaseEstimator, ClusterMixin, _fit_context
from ..metrics.pairwise import pairwise_distances_argmin
from ..neighbors import NearestNeighbors
from ..utils import check_array, check_random_state, gen_batches
from ..utils._param_validation import Interval, validate_params
from ..utils.parallel import Parallel, delayed
from ..utils.validation import check_is_fitted
@validate_params(
    {
        "X": ["array-like"],
        "quantile": [Interval(Real, 0, 1, closed="both")],
        "n_samples": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "n_jobs": [Integral, None],
    },
    prefer_skip_nested_validation=True,
)
def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None):
    """Estimate the bandwidth to use with the mean-shift algorithm.

    The estimate is the average, over the (sub)sampled points, of the
    distance to the farthest of their ``int(n_points * quantile)`` nearest
    neighbors.

    This function takes time at least quadratic in `n_samples`. For large
    datasets, it is wise to subsample by setting `n_samples`. Alternatively,
    the parameter `bandwidth` can be set to a small value without estimating
    it.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input points.

    quantile : float, default=0.3
        Should be between [0, 1]
        0.5 means that the median of all pairwise distances is used.

    n_samples : int, default=None
        The number of samples to use. If not given, all samples are used.

    random_state : int, RandomState instance, default=0
        The generator used to randomly select the samples from input points
        for bandwidth estimation. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    bandwidth : float
        The bandwidth parameter.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import estimate_bandwidth
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> estimate_bandwidth(X, quantile=0.5)
    1.61...
    """
    X = check_array(X)
    rng = check_random_state(random_state)

    if n_samples is not None:
        subset = rng.permutation(X.shape[0])[:n_samples]
        X = X[subset]

    # NearestNeighbors cannot be fitted with n_neighbors = 0
    n_neighbors = max(int(X.shape[0] * quantile), 1)
    neighbors = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)
    neighbors.fit(X)

    # Query in batches of 500 rows so that only a slice of the distance
    # matrix is held at any time.
    total = 0.0
    for batch in gen_batches(len(X), 500):
        distances, _ = neighbors.kneighbors(X[batch, :], return_distance=True)
        total += np.max(distances, axis=1).sum()

    return total / X.shape[0]
# separate function for each seed's iterative loop
def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
# For each seed, climb gradient until convergence or max_iter
bandwidth = nbrs.get_params()["radius"]
stop_thresh = 1e-3 * bandwidth # when mean has converged
completed_iterations = 0
while True:
# Find mean of points within bandwidth
i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0]
points_within = X[i_nbrs]
if len(points_within) == 0:
break # Depending on seeding strategy this condition may occur
my_old_mean = my_mean # save the old mean
my_mean = np.mean(points_within, axis=0)
# If converged or at max_iter, adds the cluster
if (
np.linalg.norm(my_mean - my_old_mean) <= stop_thresh
or completed_iterations == max_iter
):
break
completed_iterations += 1
return tuple(my_mean), len(points_within), completed_iterations
@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def mean_shift(
    X,
    *,
    bandwidth=None,
    seeds=None,
    bin_seeding=False,
    min_bin_freq=1,
    cluster_all=True,
    max_iter=300,
    n_jobs=None,
):
    """Perform mean shift clustering of data using a flat kernel.

    Functional interface to :class:`MeanShift`: an estimator is built from
    the given parameters, fitted on `X`, and its centers and labels are
    returned.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.

    bandwidth : float, default=None
        Kernel bandwidth. If not None, must be in the range [0, +inf).

        If None, the bandwidth is determined using a heuristic based on
        the median of all pairwise distances. This will take quadratic time in
        the number of samples. The sklearn.cluster.estimate_bandwidth function
        can be used to do this more efficiently.

    seeds : array-like of shape (n_seeds, n_features) or None
        Point used as initial kernel locations. If None and bin_seeding=False,
        each data point is used as a seed. If None and bin_seeding=True,
        see bin_seeding.

    bin_seeding : bool, default=False
        If true, initial kernel locations are not locations of all
        points, but rather the location of the discretized version of
        points, where points are binned onto a grid whose coarseness
        corresponds to the bandwidth. Setting this option to True will speed
        up the algorithm because fewer seeds will be initialized.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
        min_bin_freq points as seeds.

    cluster_all : bool, default=True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    max_iter : int, default=300
        Maximum number of iterations, per seed point before the clustering
        operation terminates (for that seed point), if has not converged yet.

    n_jobs : int, default=None
        The number of jobs to use for the computation. The following tasks benefit
        from the parallelization:

        - The search of nearest neighbors for bandwidth estimation and label
          assignments. See the details in the docstring of the
          ``NearestNeighbors`` class.
        - Hill-climbing optimization for all seeds.

        See :term:`Glossary <n_jobs>` for more details.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.17
           Parallel Execution using *n_jobs*.

    Returns
    -------
    cluster_centers : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_mean_shift.py
    <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import mean_shift
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> cluster_centers, labels = mean_shift(X, bandwidth=2)
    >>> cluster_centers
    array([[3.33..., 6.     ],
           [1.33..., 0.66...]])
    >>> labels
    array([1, 1, 1, 0, 0, 0])
    """
    estimator_params = {
        "bandwidth": bandwidth,
        "seeds": seeds,
        "bin_seeding": bin_seeding,
        "min_bin_freq": min_bin_freq,
        "cluster_all": cluster_all,
        "max_iter": max_iter,
        "n_jobs": n_jobs,
    }
    fitted = MeanShift(**estimator_params).fit(X)
    return fitted.cluster_centers_, fitted.labels_
def get_bin_seeds(X, bin_size, min_bin_freq=1):
    """Compute grid-based seed points for mean_shift.

    Every point is snapped onto a regular grid whose cells are ``bin_size``
    wide. Cells that collect at least ``min_bin_freq`` points are kept and
    their grid coordinates (scaled back by ``bin_size``) become the seeds.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input points, the same points that will be used in mean_shift.

    bin_size : float
        Controls the coarseness of the binning. Smaller values lead
        to more seeding (which is computationally more expensive). If you're
        not sure how to set this, set it to the value of the bandwidth used
        in clustering.mean_shift.

    min_bin_freq : int, default=1
        Only bins with at least min_bin_freq will be selected as seeds.
        Raising this value decreases the number of seeds found, which
        makes mean_shift computationally cheaper.

    Returns
    -------
    bin_seeds : array-like of shape (n_seeds, n_features)
        Points used as initial kernel positions in clustering.mean_shift.
        ``X`` itself is returned when ``bin_size`` is 0, or when binning
        produced as many seeds as there are points (a warning is emitted in
        the latter case).
    """
    # A zero-width grid cannot bin anything: fall back to the raw points.
    if bin_size == 0:
        return X

    # Count the number of points falling into each grid cell. A cell is
    # identified by the tuple of its rounded, rescaled coordinates.
    cell_counts = defaultdict(int)
    for sample in X:
        cell = tuple(np.round(sample / bin_size))
        cell_counts[cell] += 1

    # Keep only the cells that are populated enough to act as seeds.
    populated_cells = [
        cell for cell, count in cell_counts.items() if count >= min_bin_freq
    ]
    seeds = np.array(populated_cells, dtype=np.float32)

    # As many seeds as points means the binning did not reduce anything.
    if len(seeds) == len(X):
        warnings.warn(
            "Binning data failed with provided bin_size=%f, using data points as seeds."
            % bin_size
        )
        return X

    # Map grid coordinates back to the original scale.
    return seeds * bin_size
class MeanShift(ClusterMixin, BaseEstimator):
    """Mean shift clustering using a flat kernel.

    Mean shift clustering aims to discover "blobs" in a smooth density of
    samples. It is a centroid-based algorithm, which works by updating
    candidates for centroids to be the mean of the points within a given
    region. These candidates are then filtered in a post-processing stage to
    eliminate near-duplicates to form the final set of centroids.

    Seeding is performed using a binning technique for scalability.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------
    bandwidth : float, default=None
        Bandwidth used in the flat kernel.

        If not given, the bandwidth is estimated using
        sklearn.cluster.estimate_bandwidth; see the documentation for that
        function for hints on scalability (see also the Notes, below).

    seeds : array-like of shape (n_samples, n_features), default=None
        Seeds used to initialize kernels. If not set,
        the seeds are calculated by clustering.get_bin_seeds
        with bandwidth as the grid size and default values for
        other parameters.

    bin_seeding : bool, default=False
        If true, initial kernel locations are not locations of all
        points, but rather the location of the discretized version of
        points, where points are binned onto a grid whose coarseness
        corresponds to the bandwidth. Setting this option to True will speed
        up the algorithm because fewer seeds will be initialized.
        The default value is False.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
        min_bin_freq points as seeds.

    cluster_all : bool, default=True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    n_jobs : int, default=None
        The number of jobs to use for the computation. The following tasks benefit
        from the parallelization:

        - The search of nearest neighbors for bandwidth estimation and label
          assignments. See the details in the docstring of the
          ``NearestNeighbors`` class.
        - Hill-climbing optimization for all seeds.

        See :term:`Glossary <n_jobs>` for more details.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, default=300
        Maximum number of iterations, per seed point before the clustering
        operation terminates (for that seed point), if it has not converged
        yet.

        .. versionadded:: 0.22

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    n_iter_ : int
        Maximum number of iterations performed on each seed.

        .. versionadded:: 0.22

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    KMeans : K-Means clustering.

    Notes
    -----

    Scalability:

    Because this implementation uses a flat kernel and
    a Ball Tree to look up members of each kernel, the complexity will tend
    towards O(T*n*log(n)) in lower dimensions, with n the number of samples
    and T the number of points. In higher dimensions the complexity will
    tend towards O(T*n^2).

    Scalability can be boosted by using fewer seeds, for example by using
    a higher value of min_bin_freq in the get_bin_seeds function.

    Note that the estimate_bandwidth function is much less scalable than the
    mean shift algorithm and will be the bottleneck if it is used.

    References
    ----------

    Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
    feature space analysis". IEEE Transactions on Pattern Analysis and
    Machine Intelligence. 2002. pp. 603-619.

    Examples
    --------
    >>> from sklearn.cluster import MeanShift
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = MeanShift(bandwidth=2).fit(X)
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    >>> clustering.predict([[0, 0], [5, 5]])
    array([1, 0])
    >>> clustering
    MeanShift(bandwidth=2)
    """

    _parameter_constraints: dict = {
        "bandwidth": [Interval(Real, 0, None, closed="neither"), None],
        "seeds": ["array-like", None],
        "bin_seeding": ["boolean"],
        "min_bin_freq": [Interval(Integral, 1, None, closed="left")],
        "cluster_all": ["boolean"],
        "n_jobs": [Integral, None],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
    }

    def __init__(
        self,
        *,
        bandwidth=None,
        seeds=None,
        bin_seeding=False,
        min_bin_freq=1,
        cluster_all=True,
        n_jobs=None,
        max_iter=300,
    ):
        # Hyper-parameters are stored untouched, in signature order.
        self.bandwidth = bandwidth
        self.seeds = seeds
        self.bin_seeding = bin_seeding
        self.min_bin_freq = min_bin_freq
        self.cluster_all = cluster_all
        self.n_jobs = n_jobs
        self.max_iter = max_iter

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Perform clustering.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to cluster.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
               Fitted instance.
        """
        X = self._validate_data(X)

        # Fall back to an estimated bandwidth when none was provided.
        bandwidth = self.bandwidth
        if bandwidth is None:
            bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)

        # Seed selection: user-provided seeds win, then grid binning, and
        # finally every sample is used as a seed.
        if self.seeds is not None:
            seeds = self.seeds
        elif self.bin_seeding:
            seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
        else:
            seeds = X

        n_samples, _ = X.shape

        # n_jobs=1 here: this neighbors index is queried from inside the
        # parallel calls to _mean_shift_single_seed below, so nesting
        # further parallelism would bring nothing.
        sample_index = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)

        # Run the iterations for all seeds in parallel.
        seed_results = Parallel(n_jobs=self.n_jobs)(
            delayed(_mean_shift_single_seed)(seed, X, sample_index, self.max_iter)
            for seed in seeds
        )

        # Map each converged center to its intensity, dropping the results
        # whose second entry is falsy (i.e. len(points_within) == 0).
        center_intensity = {
            result[0]: result[1] for result in seed_results if result[1]
        }

        self.n_iter_ = max(result[2] for result in seed_results)

        if not center_intensity:
            # nothing near seeds
            raise ValueError(
                "No point was within bandwidth=%f of any seed. Try a different seeding"
                " strategy or increase the bandwidth."
                % bandwidth
            )

        # POST PROCESSING: remove near duplicate points.
        # Two kernels closer than the bandwidth are duplicates of each other;
        # walking the centers from the most to the least populated one and
        # discarding the neighbors of every kept center removes the one with
        # fewer points.
        ranked = sorted(
            center_intensity.items(),
            key=lambda item: (item[1], item[0]),
            reverse=True,
        )
        sorted_centers = np.array([center for center, _ in ranked])
        keep = np.ones(len(sorted_centers), dtype=bool)
        center_index = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit(
            sorted_centers
        )
        for i, center in enumerate(sorted_centers):
            if not keep[i]:
                continue
            close_by = center_index.radius_neighbors([center], return_distance=False)[
                0
            ]
            keep[close_by] = 0
            keep[i] = 1  # the current center itself stays
        cluster_centers = sorted_centers[keep]

        # ASSIGN LABELS: a point belongs to the cluster that it is closest to.
        nearest_center = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(
            cluster_centers
        )
        distances, idxs = nearest_center.kneighbors(X)
        if self.cluster_all:
            labels = idxs.flatten()
        else:
            # Points farther than the bandwidth from every center are orphans
            # and keep the label -1.
            labels = np.full(n_samples, -1, dtype=int)
            within_reach = distances.flatten() <= bandwidth
            labels[within_reach] = idxs.flatten()[within_reach]

        self.cluster_centers_ = cluster_centers
        self.labels_ = labels
        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self)
        X = self._validate_data(X, reset=False)
        # X was validated just above, so the distance computation runs with
        # assume_finite=True.
        with config_context(assume_finite=True):
            closest = pairwise_distances_argmin(X, self.cluster_centers_)
        return closest
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,801 @@
"""Algorithms for spectral clustering"""
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Brian Cheung
# Wei LI <kuantkid@gmail.com>
# Andrew Knyazev <Andrew.Knyazev@ucdenver.edu>
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy.linalg import LinAlgError, qr, svd
from scipy.sparse import csc_matrix
from ..base import BaseEstimator, ClusterMixin, _fit_context
from ..manifold._spectral_embedding import _spectral_embedding
from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..utils import as_float_array, check_random_state
from ..utils._param_validation import Interval, StrOptions, validate_params
from ._kmeans import k_means
def cluster_qr(vectors):
    """Find the discrete partition closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    .. versionadded:: 1.1

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The cluster labels of vectors.

    References
    ----------
    .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
        Anil Damle, Victor Minden, Lexing Ying
        <10.1093/imaiai/iay008>`

    """
    n_clusters = vectors.shape[1]

    # Pivoted QR of the transposed embedding; only the pivot order is used,
    # and its first n_clusters entries select the rows fed to the SVD.
    pivots = qr(vectors.T, pivoting=True)[2]
    selected_rows = vectors[pivots[:n_clusters], :]

    # Combine the two unitary SVD factors of the selected rows and apply the
    # result to the whole embedding.
    left, _, right = svd(selected_rows.T)
    rotation = np.dot(left, right.conj())
    scores = abs(np.dot(vectors, rotation))

    # Each sample gets the index of its largest-magnitude coordinate.
    return scores.argmax(axis=1)
def discretize(
    vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
):
    """Search for a partition matrix which is closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_clusters)
        The embedding space of the samples.

    copy : bool, default=True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, default=30
        Maximum number of attempts to restart SVD if convergence fails

    n_iter_max : int, default=20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached

    random_state : int, RandomState instance, default=None
        Determines random number generation for rotation matrix initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Raises
    ------
    LinAlgError
        If no SVD converged within ``max_svd_restarts`` restarts.

    References
    ----------

    .. [1] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    Notes
    -----

    The eigenvector embedding is used to iteratively search for the
    closest discrete partition.  First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated.  Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated.  These two
    calculations are performed until convergence.  The discrete partition
    matrix is returned as the clustering solution.  Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.

    """
    rng = check_random_state(random_state)
    vectors = as_float_array(vectors, copy=copy)

    machine_eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Rescale every eigenvector to the norm of a vector of ones, then flip
    # its sign so that its first element is negative (left alone when that
    # element is zero). This may have to do with constraining the
    # eigenvectors to lie in a specific quadrant to make the discretization
    # search easier.
    ones_norm = np.sqrt(n_samples)
    for col in range(n_components):
        vectors[:, col] = (
            vectors[:, col] / np.linalg.norm(vectors[:, col])
        ) * ones_norm
        head = vectors[0, col]
        if head != 0:
            vectors[:, col] = -1 * vectors[:, col] * np.sign(head)

    # Normalize the rows so that the samples lie on the unit hypersphere
    # centered at the origin. This maps the samples from the embedding space
    # to the space of partition matrices.
    row_norms = np.sqrt((vectors**2).sum(axis=1))
    vectors = vectors / row_norms[:, np.newaxis]

    restarts = 0
    converged = False

    # Whenever the SVD fails, re-draw the initial rotation and start over,
    # at most max_svd_restarts times.
    while restarts < max_svd_restarts and not converged:
        # The first column of the rotation is a randomly drawn row of the
        # eigenvectors.
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[rng.randint(n_samples), :].T

        # The remaining columns are the rows of the eigenvectors that are as
        # orthogonal as possible to the columns picked so far: `overlap`
        # accumulates the absolute projections on every previous pick.
        overlap = np.zeros(n_samples)
        for j in range(1, n_components):
            overlap += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[overlap.argmin(), :].T

        previous_objective = 0.0
        n_iter = 0

        while not converged:
            n_iter += 1

            # Discrete partition for the current rotation.
            projected = np.dot(vectors, rotation)
            labels = projected.argmax(axis=1)
            indicator = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components),
            )

            correlation = indicator.T * vectors

            try:
                U, S, Vh = np.linalg.svd(correlation)
            except LinAlgError:
                restarts += 1
                print("SVD did not converge, randomizing and trying again")
                break

            objective = 2.0 * (n_samples - S.sum())
            stalled = abs(objective - previous_objective) < machine_eps
            if stalled or (n_iter > n_iter_max):
                converged = True
            else:
                # Not there yet: update the rotation and iterate again.
                previous_objective = objective
                rotation = np.dot(Vh.T, U.T)

    if not converged:
        raise LinAlgError("SVD did not converge")
    return labels
@validate_params(
    {"affinity": ["array-like", "sparse matrix"]},
    prefer_skip_nested_validation=False,
)
def spectral_clustering(
    affinity,
    *,
    n_clusters=8,
    n_components=None,
    eigen_solver=None,
    random_state=None,
    n_init=10,
    eigen_tol="auto",
    assign_labels="kmeans",
    verbose=False,
):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance, when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts [1]_, [2]_.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters : int, default=8
        Number of clusters to extract.

    n_components : int, default=n_clusters
        Number of eigenvectors to use for the spectral embedding.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition method. If None then ``'arpack'`` is used.
        See [4]_ for more details regarding ``'lobpcg'``.
        Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
        Algebraic MultiGrid preconditioning and requires pyamg to be installed.
        It can be faster on very large sparse problems [6]_ and [7]_.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy to use to assign labels in the embedding
        space.  There are three ways to assign labels after the Laplacian
        embedding.  k-means can be applied and is a popular choice. But it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and is not an iterative method, yet may outperform
        k-means and discretization in terms of both quality and speed.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Notes
    -----
    The graph should contain only one connected component, otherwise
    the results make little sense.

    This algorithm solves the normalized cut for `k=2`: it is a
    normalized spectral clustering.

    References
    ----------

    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
           for computing eigenvalues of graph Laplacians in image segmentation, 2006
           Andrew Knyazev
           <10.13140/RG.2.2.35280.02565>`

    .. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
           streaming graph challenge (Preliminary version at arXiv.)
           David Zhuzhunashvili, Andrew Knyazev
           <10.1109/HPEC.2017.8091045>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> from sklearn.cluster import spectral_clustering
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> affinity = pairwise_kernels(X, metric='rbf')
    >>> spectral_clustering(
    ...     affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0
    ... )
    array([1, 1, 1, 0, 0, 0])
    """
    # The caller hands in a ready-made affinity matrix, so the estimator is
    # always configured with affinity="precomputed"; every other option is
    # forwarded unchanged.
    estimator_params = {
        "n_clusters": n_clusters,
        "n_components": n_components,
        "eigen_solver": eigen_solver,
        "random_state": random_state,
        "n_init": n_init,
        "affinity": "precomputed",
        "eigen_tol": eigen_tol,
        "assign_labels": assign_labels,
        "verbose": verbose,
    }
    fitted = SpectralClustering(**estimator_params).fit(affinity)
    return fitted.labels_
class SpectralClustering(ClusterMixin, BaseEstimator):
"""Apply clustering to a projection of the normalized Laplacian.
In practice Spectral Clustering is very useful when the structure of
the individual clusters is highly non-convex, or more generally when
a measure of the center and spread of the cluster is not a suitable
description of the complete cluster, such as when clusters are
nested circles on the 2D plane.
If the affinity matrix is the adjacency matrix of a graph, this method
can be used to find normalized graph cuts [1]_, [2]_.
When calling ``fit``, an affinity matrix is constructed using either
a kernel function such as the Gaussian (aka RBF) kernel with Euclidean
distance ``d(X, X)``::
np.exp(-gamma * d(X,X) ** 2)
or a k-nearest neighbors connectivity matrix.
Alternatively, a user-provided affinity matrix can be specified by
setting ``affinity='precomputed'``.
Read more in the :ref:`User Guide <spectral_clustering>`.
Parameters
----------
n_clusters : int, default=8
The dimension of the projection subspace.
eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities. If None, then ``'arpack'`` is
used. See [4]_ for more details regarding `'lobpcg'`.
n_components : int, default=None
Number of eigenvectors to use for the spectral embedding. If None,
defaults to `n_clusters`.
random_state : int, RandomState instance, default=None
A pseudo random number generator used for the initialization
of the lobpcg eigenvectors decomposition when `eigen_solver ==
'amg'`, and for the K-Means initialization. Use an int to make
the results deterministic across calls (See
:term:`Glossary <random_state>`).
.. note::
When using `eigen_solver == 'amg'`,
it is necessary to also fix the global numpy seed with
`np.random.seed(int)` to get deterministic results. See
https://github.com/pyamg/pyamg/issues/139 for further
information.
n_init : int, default=10
Number of time the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of n_init
consecutive runs in terms of inertia. Only used if
``assign_labels='kmeans'``.
gamma : float, default=1.0
Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
or ``affinity='precomputed_nearest_neighbors'``.
affinity : str or callable, default='rbf'
How to construct the affinity matrix.
- 'nearest_neighbors': construct the affinity matrix by computing a
graph of nearest neighbors.
- 'rbf': construct the affinity matrix using a radial basis function
(RBF) kernel.
- 'precomputed': interpret ``X`` as a precomputed affinity matrix,
where larger values indicate greater similarity between instances.
- 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
of precomputed distances, and construct a binary affinity matrix
from the ``n_neighbors`` nearest neighbors of each instance.
- one of the kernels supported by
:func:`~sklearn.metrics.pairwise.pairwise_kernels`.
Only kernels that produce similarity scores (non-negative values that
increase with similarity) should be used. This property is not checked
by the clustering algorithm.
n_neighbors : int, default=10
Number of neighbors to use when constructing the affinity matrix using
the nearest neighbors method. Ignored for ``affinity='rbf'``.
eigen_tol : float, default="auto"
Stopping criterion for eigen decomposition of the Laplacian matrix.
If `eigen_tol="auto"` then the passed tolerance will depend on the
`eigen_solver`:
- If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
- If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
`eigen_tol=None` which configures the underlying `lobpcg` solver to
automatically resolve the value according to their heuristics. See,
:func:`scipy.sparse.linalg.lobpcg` for details.
Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
values of `tol<1e-5` may lead to convergence issues and should be
avoided.
.. versionadded:: 1.2
Added 'auto' option.
assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
The strategy for assigning labels in the embedding space. There are three
ways to assign labels after the Laplacian embedding. k-means is a
popular choice, but it can be sensitive to initialization.
Discretization is another approach which is less sensitive to random
initialization [3]_.
The cluster_qr method [5]_ directly extracts clusters from eigenvectors
in spectral clustering. In contrast to k-means and discretization, cluster_qr
has no tuning parameters and runs no iterations, yet may outperform
k-means and discretization in terms of both quality and speed.
.. versionchanged:: 1.1
Added new labeling method 'cluster_qr'.
degree : float, default=3
Degree of the polynomial kernel. Ignored by other kernels.
coef0 : float, default=1
Zero coefficient for polynomial and sigmoid kernels.
Ignored by other kernels.
kernel_params : dict of str to any, default=None
Parameters (keyword arguments) and values for kernel passed as
callable object. Ignored by other kernels.
n_jobs : int, default=None
The number of parallel jobs to run when `affinity='nearest_neighbors'`
or `affinity='precomputed_nearest_neighbors'`. The neighbors search
will be done in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : bool, default=False
Verbosity mode.
.. versionadded:: 0.24
Attributes
----------
affinity_matrix_ : array-like of shape (n_samples, n_samples)
Affinity matrix used for clustering. Available only after calling
``fit``.
labels_ : ndarray of shape (n_samples,)
Labels of each point
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
sklearn.cluster.KMeans : K-Means clustering.
sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
Applications with Noise.
Notes
-----
A distance matrix for which 0 indicates identical elements and high values
indicate very dissimilar elements can be transformed into an affinity /
similarity matrix that is well-suited for the algorithm by
applying the Gaussian (aka RBF, heat) kernel::
np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
where ``delta`` is a free parameter representing the width of the Gaussian
kernel.
An alternative is to take a symmetric version of the k-nearest neighbors
connectivity matrix of the points.
If the pyamg package is installed, it is used: this greatly
speeds up computation.
References
----------
.. [1] :doi:`Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
<10.1109/34.868688>`
.. [2] :doi:`A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
<10.1007/s11222-007-9033-z>`
.. [3] `Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
<https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_
.. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
A. V. Knyazev
SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
<10.1137/S1064827500366124>`
.. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
Anil Damle, Victor Minden, Lexing Ying
<10.1093/imaiai/iay008>`
Examples
--------
>>> from sklearn.cluster import SpectralClustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralClustering(n_clusters=2,
... assign_labels='discretize',
... random_state=0).fit(X)
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])
>>> clustering
SpectralClustering(assign_labels='discretize', n_clusters=2,
random_state=0)
"""
_parameter_constraints: dict = {
"n_clusters": [Interval(Integral, 1, None, closed="left")],
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
"n_components": [Interval(Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
"n_init": [Interval(Integral, 1, None, closed="left")],
"gamma": [Interval(Real, 0, None, closed="left")],
"affinity": [
callable,
StrOptions(
set(KERNEL_PARAMS)
| {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
),
],
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
"eigen_tol": [
Interval(Real, 0.0, None, closed="left"),
StrOptions({"auto"}),
],
"assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
"degree": [Interval(Real, 0, None, closed="left")],
"coef0": [Interval(Real, None, None, closed="neither")],
"kernel_params": [dict, None],
"n_jobs": [Integral, None],
"verbose": ["verbose"],
}
def __init__(
    self,
    n_clusters=8,
    *,
    eigen_solver=None,
    n_components=None,
    random_state=None,
    n_init=10,
    gamma=1.0,
    affinity="rbf",
    n_neighbors=10,
    eigen_tol="auto",
    assign_labels="kmeans",
    degree=3,
    coef0=1,
    kernel_params=None,
    n_jobs=None,
    verbose=False,
):
    # Every argument is stored as-is under the attribute of the same name;
    # nothing is validated or transformed here.

    # Embedding / eigendecomposition settings.
    self.n_clusters = n_clusters
    self.n_components = n_components
    self.eigen_solver = eigen_solver
    self.eigen_tol = eigen_tol

    # Affinity construction settings.
    self.affinity = affinity
    self.gamma = gamma
    self.degree = degree
    self.coef0 = coef0
    self.kernel_params = kernel_params
    self.n_neighbors = n_neighbors

    # Label assignment settings.
    self.assign_labels = assign_labels
    self.n_init = n_init
    self.random_state = random_state

    # Execution settings.
    self.n_jobs = n_jobs
    self.verbose = verbose
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
    """Perform spectral clustering from features, or affinity matrix.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples)
        Training instances to cluster, similarities / affinities between
        instances if ``affinity='precomputed'``, or distances between
        instances if ``affinity='precomputed_nearest_neighbors'``. If a
        sparse matrix is provided in a format other than ``csr_matrix``,
        ``csc_matrix``, or ``coo_matrix``, it will be converted into a
        sparse ``csr_matrix``.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : object
        A fitted instance of the estimator.
    """
    X = self._validate_data(
        X,
        accept_sparse=["csr", "csc", "coo"],
        dtype=np.float64,
        ensure_min_samples=2,
    )

    # A square input combined with a non-precomputed affinity is suspicious:
    # the caller may have passed an affinity matrix by mistake.
    allow_squared = self.affinity in [
        "precomputed",
        "precomputed_nearest_neighbors",
    ]
    if X.shape[0] == X.shape[1] and not allow_squared:
        warnings.warn(
            "The spectral clustering API has changed. ``fit`` "
            "now constructs an affinity matrix from data. To use"
            " a custom affinity matrix, "
            "set ``affinity=precomputed``."
        )

    if self.affinity == "nearest_neighbors":
        connectivity = kneighbors_graph(
            X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
        )
        # Symmetrize the k-NN graph.
        self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
    elif self.affinity == "precomputed_nearest_neighbors":
        estimator = NearestNeighbors(
            n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
        ).fit(X)
        connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
        self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
    elif self.affinity == "precomputed":
        self.affinity_matrix_ = X
    else:
        # Work on a copy so that the dict supplied by the user through
        # ``kernel_params`` is never mutated by ``fit``.
        params = {} if self.kernel_params is None else dict(self.kernel_params)
        if not callable(self.affinity):
            params["gamma"] = self.gamma
            params["degree"] = self.degree
            params["coef0"] = self.coef0
        self.affinity_matrix_ = pairwise_kernels(
            X, metric=self.affinity, filter_params=True, **params
        )

    random_state = check_random_state(self.random_state)
    n_components = (
        self.n_clusters if self.n_components is None else self.n_components
    )
    # We now obtain the real valued solution matrix to the
    # relaxed Ncut problem, solving the eigenvalue problem
    # L_sym x = lambda x and recovering u = D^-1/2 x.
    # The first eigenvector is constant only for fully connected graphs
    # and should be kept for spectral clustering (drop_first = False)
    # See spectral_embedding documentation.
    maps = _spectral_embedding(
        self.affinity_matrix_,
        n_components=n_components,
        eigen_solver=self.eigen_solver,
        random_state=random_state,
        eigen_tol=self.eigen_tol,
        drop_first=False,
    )
    if self.verbose:
        print(f"Computing label assignment using {self.assign_labels}")

    # Turn the continuous embedding into discrete cluster labels.
    if self.assign_labels == "kmeans":
        _, self.labels_, _ = k_means(
            maps,
            self.n_clusters,
            random_state=random_state,
            n_init=self.n_init,
            verbose=self.verbose,
        )
    elif self.assign_labels == "cluster_qr":
        self.labels_ = cluster_qr(maps)
    else:
        self.labels_ = discretize(maps, random_state=random_state)

    return self
def fit_predict(self, X, y=None):
    """Fit the spectral clustering model on `X` and return the labels.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples)
        Samples to cluster. Interpreted as similarities / affinities when
        ``affinity='precomputed'`` and as distances when
        ``affinity='precomputed_nearest_neighbors'``. Sparse input in a
        format other than ``csr_matrix``, ``csc_matrix`` or ``coo_matrix``
        is converted into a sparse ``csr_matrix``.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster labels.
    """
    # Pure delegation to the parent implementation.
    labels = super().fit_predict(X, y)
    return labels
def _more_tags(self):
return {
"pairwise": self.affinity
in [
"precomputed",
"precomputed_nearest_neighbors",
]
}
@@ -0,0 +1,29 @@
# Maps each Cython extension module name to its source files and, where
# needed, per-target option overrides (e.g. compiling the Cython output as C++).
# NOTE(review): metrics_cython_tree, utils_cython_tree, np_dep, openmp_dep and
# cython_args are not defined here; presumably set by a parent build file.
cluster_extension_metadata = {
'_dbscan_inner':
{'sources': ['_dbscan_inner.pyx'], 'override_options': ['cython_language=cpp']},
'_hierarchical_fast':
{'sources': ['_hierarchical_fast.pyx', metrics_cython_tree],
'override_options': ['cython_language=cpp']},
'_k_means_common':
{'sources': ['_k_means_common.pyx']},
'_k_means_lloyd':
{'sources': ['_k_means_lloyd.pyx']},
'_k_means_elkan':
{'sources': ['_k_means_elkan.pyx']},
'_k_means_minibatch':
{'sources': ['_k_means_minibatch.pyx']},
}
# Declare one extension module per entry; entries without 'override_options'
# fall back to an empty override list.
foreach ext_name, ext_dict : cluster_extension_metadata
py.extension_module(
ext_name,
[ext_dict.get('sources'), utils_cython_tree],
dependencies: [np_dep, openmp_dep],
override_options : ext_dict.get('override_options', []),
cython_args: cython_args,
subdir: 'sklearn/cluster',
install: true
)
endforeach
# Descend into the _hdbscan sub-directory's own build definition.
subdir('_hdbscan')
@@ -0,0 +1,37 @@
"""
Common utilities for testing clustering.
"""
import numpy as np
###############################################################################
# Generate sample data
def generate_clustered_data(
    seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4
):
    """Generate Gaussian blobs around fixed, off-origin cluster means.

    Returns an array of shape
    ``(n_clusters * n_samples_per_cluster, n_features)`` whose rows are grouped
    cluster by cluster. The mean table below provides at most four clusters and
    four features.
    """
    rng = np.random.RandomState(seed)

    # The means are deliberately shifted away from zero to check the
    # robustness of clustering algorithms to non-centered data.
    cluster_means = 10 + np.array(
        [
            [1, 1, 1, 0],
            [-1, -1, 0, 1],
            [1, -1, 1, 1],
            [-1, 1, 1, 0],
        ]
    )

    # Start from an empty block so that ``n_clusters=0`` still yields an array
    # of shape (0, n_features); noise is drawn cluster by cluster, in order.
    blocks = [np.empty((0, n_features))]
    for cluster_idx in range(n_clusters):
        noise = std * rng.randn(n_samples_per_cluster, n_features)
        blocks.append(cluster_means[cluster_idx][:n_features] + noise)
    return np.vstack(blocks)
@@ -0,0 +1,321 @@
"""
Testing for Clustering methods
"""
import warnings
import numpy as np
import pytest
from sklearn.cluster import AffinityPropagation, affinity_propagation
from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning, NotFittedError
from sklearn.metrics import euclidean_distances
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
# Number of blobs generated below; several tests compare the number of
# exemplars found by affinity propagation against this value.
n_clusters = 3
# Blob centers, shifted by +10 so that the data is not centered on the origin.
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
# Shared module-level data: 60 two-dimensional samples drawn around `centers`.
X, _ = make_blobs(
    n_samples=60,
    n_features=2,
    centers=centers,
    cluster_std=0.4,
    shuffle=True,
    random_state=0,
)
# TODO: AffinityPropagation must preserve dtype for its fitted attributes,
# and tests must be created according to this new behavior.
# For more details, see: https://github.com/scikit-learn/scikit-learn/issues/11000
def test_affinity_propagation(global_random_seed, global_dtype):
    """Check that the functional API finds the expected number of clusters."""
    data = X.astype(global_dtype, copy=False)
    similarities = -euclidean_distances(data, squared=True)
    median_preference = np.median(similarities) * 10

    exemplar_indices, _ = affinity_propagation(
        similarities,
        preference=median_preference,
        random_state=global_random_seed,
    )

    assert n_clusters == len(exemplar_indices)
def test_affinity_propagation_precomputed():
    """Check that a precomputed affinity matrix gives the same labels as the
    affinity computed internally from the features.
    """
    similarities = -euclidean_distances(X, squared=True)
    preference = np.median(similarities) * 10

    # Fit on the explicit similarity matrix ...
    precomputed_model = AffinityPropagation(
        preference=preference, affinity="precomputed", random_state=28
    )
    labels_precomputed = precomputed_model.fit(similarities).labels_

    # ... and on the raw features with the default affinity.
    feature_model = AffinityPropagation(
        preference=preference, verbose=True, random_state=37
    )
    labels = feature_model.fit(X).labels_
    assert_array_equal(labels, labels_precomputed)

    n_found = len(feature_model.cluster_centers_indices_)
    assert np.unique(labels).size == n_found
    assert n_clusters == n_found
def test_affinity_propagation_no_copy():
    """Check the in-place behaviour controlled by the `copy` argument."""
    similarities = -euclidean_distances(X, squared=True)
    pristine = similarities.copy()
    preference = np.median(similarities) * 10
    assert not np.allclose(similarities.diagonal(), preference)

    # copy=True: the caller's matrix must be left untouched.
    affinity_propagation(
        similarities, preference=preference, copy=True, random_state=0
    )
    assert_allclose(similarities, pristine)
    assert not np.allclose(similarities.diagonal(), preference)
    assert_allclose(similarities.diagonal(), np.zeros(similarities.shape[0]))

    # copy=False: the preference is written onto the diagonal in place.
    affinity_propagation(
        similarities, preference=preference, copy=False, random_state=0
    )
    assert_allclose(similarities.diagonal(), preference)

    # Both modes must lead to the same labelling.
    similarities = pristine.copy()
    estimator = AffinityPropagation(
        preference=preference, verbose=True, random_state=0
    )
    labels = estimator.fit(X).labels_
    _, labels_no_copy = affinity_propagation(
        similarities, preference=preference, copy=False, random_state=74
    )
    assert_array_equal(labels, labels_no_copy)
def test_affinity_propagation_affinity_shape():
    """Check that `affinity_propagation` rejects a non-square similarity matrix."""
    similarities = -euclidean_distances(X, squared=True)
    non_square = similarities[:, :-1]
    with pytest.raises(
        ValueError, match="The matrix of similarities must be a square array"
    ):
        affinity_propagation(non_square)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_affinity_propagation_precomputed_with_sparse_input(csr_container):
    """Check that a sparse precomputed affinity matrix is rejected by `fit`."""
    sparse_similarities = csr_container((3, 3))
    estimator = AffinityPropagation(affinity="precomputed")
    with pytest.raises(
        TypeError, match="Sparse data was passed for X, but dense data is required"
    ):
        estimator.fit(sparse_similarities)
def test_affinity_propagation_predict(global_random_seed, global_dtype):
    """Check that `predict` on the training data matches `fit_predict`."""
    data = X.astype(global_dtype, copy=False)
    estimator = AffinityPropagation(
        affinity="euclidean", random_state=global_random_seed
    )
    fitted_labels = estimator.fit_predict(data)
    predicted_labels = estimator.predict(data)
    assert_array_equal(fitted_labels, predicted_labels)
def test_affinity_propagation_predict_error():
    """Check the errors raised by `AffinityPropagation.predict`."""
    # An unfitted estimator cannot predict.
    unfitted = AffinityPropagation(affinity="euclidean")
    with pytest.raises(NotFittedError):
        unfitted.predict(X)

    # After fitting on a 60x60 precomputed matrix, the 2-feature data is
    # rejected because its number of features does not match.
    gram = np.dot(X, X.T)
    precomputed = AffinityPropagation(affinity="precomputed", random_state=57)
    precomputed.fit(gram)
    with pytest.raises(ValueError, match="expecting 60 features as input"):
        precomputed.predict(X)
def test_affinity_propagation_fit_non_convergence(global_dtype):
    """Check the fitted attributes when `fit` does not converge.

    The cluster centers must be an empty array and every training sample must
    be labelled as noise (-1).
    """
    points = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)

    # A single iteration is not enough to converge.
    estimator = AffinityPropagation(preference=-10, max_iter=1, random_state=82)
    with pytest.warns(ConvergenceWarning):
        estimator.fit(points)

    assert_allclose(np.empty((0, 2)), estimator.cluster_centers_)
    assert_array_equal(np.array([-1, -1, -1]), estimator.labels_)
def test_affinity_propagation_equal_mutual_similarities(global_dtype):
    """Check the special-cased results when all similarities are equal."""
    points = np.array([[-1, 1], [1, -1]], dtype=global_dtype)
    similarities = -euclidean_distances(points, squared=True)

    # Preference above the similarity: every sample becomes an exemplar.
    with pytest.warns(UserWarning, match="mutually equal"):
        exemplars, labels = affinity_propagation(similarities, preference=0)
    assert_array_equal([0, 1], exemplars)
    assert_array_equal([0, 1], labels)

    # Preference below the similarity: a single cluster whose exemplar is the
    # (arbitrary) first sample.
    with pytest.warns(UserWarning, match="mutually equal"):
        exemplars, labels = affinity_propagation(similarities, preference=-10)
    assert_array_equal([0], exemplars)
    assert_array_equal([0, 0], labels)

    # Distinct preferences: no warning may be emitted, and the sample with the
    # highest preference is the exemplar of the single cluster.
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        exemplars, labels = affinity_propagation(
            similarities, preference=[-20, -10], random_state=37
        )
    assert_array_equal([1], exemplars)
    assert_array_equal([0, 0], labels)
def test_affinity_propagation_predict_non_convergence(global_dtype):
    """Check that `predict` labels everything as noise after non-convergence."""
    train = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)

    # A single iteration is not enough to converge.
    with pytest.warns(ConvergenceWarning):
        estimator = AffinityPropagation(
            preference=-10, max_iter=1, random_state=75
        ).fit(train)

    # With no clusters available, new samples are treated as noise.
    new_samples = np.array([[2, 2], [3, 3], [4, 4]])
    with pytest.warns(ConvergenceWarning):
        predicted = estimator.predict(new_samples)
    assert_array_equal(np.array([-1, -1, -1]), predicted)
def test_affinity_propagation_non_convergence_regressiontest(global_dtype):
    """Check the warning message and labels when two iterations do not converge."""
    points = np.array(
        [[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]],
        dtype=global_dtype,
    )
    estimator = AffinityPropagation(
        affinity="euclidean", max_iter=2, random_state=34
    )
    expected_warning = (
        "Affinity propagation did not converge, this model may return degenerate"
        " cluster centers and labels."
    )
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        estimator.fit(points)

    assert_array_equal(np.array([0, 0, 0]), estimator.labels_)
def test_equal_similarities_and_preferences(global_dtype):
    """Check `_equal_similarities_and_preferences` on equal and unequal inputs."""
    # Three points with unequal pairwise distances: never "all equal".
    points = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype)
    similarities = -euclidean_distances(points, squared=True)
    for preference in (np.array(0), np.array([0, 0]), np.array([0, 1])):
        assert not _equal_similarities_and_preferences(similarities, preference)

    # Two points: the single mutual distance is trivially equal.
    points = np.array([[0, 0], [1, 1]], dtype=global_dtype)
    similarities = -euclidean_distances(points, squared=True)

    # Differing preferences break the equality ...
    assert not _equal_similarities_and_preferences(similarities, np.array([0, 1]))

    # ... identical preferences (vector or scalar) keep it.
    for preference in (np.array([0, 0]), np.array(0)):
        assert _equal_similarities_and_preferences(similarities, preference)
def test_affinity_propagation_random_state():
    """Check that different random states give different initialisations.

    The comparison is made on the center locations after two iterations.
    """
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    data, _ = make_blobs(
        n_samples=300, centers=blob_centers, cluster_std=0.5, random_state=0
    )

    # random_state = 0
    model_seed_0 = AffinityPropagation(
        convergence_iter=1, max_iter=2, random_state=0
    )
    model_seed_0.fit(data)
    centers0 = model_seed_0.cluster_centers_

    # random_state = 76
    model_seed_76 = AffinityPropagation(
        convergence_iter=1, max_iter=2, random_state=76
    )
    model_seed_76.fit(data)
    centers76 = model_seed_76.cluster_centers_

    # The two runs must not yet have converged to the same solution.
    mean_squared_gap = np.mean((centers0 - centers76) ** 2)
    assert mean_squared_gap > 1
@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array])
def test_affinity_propagation_convergence_warning_dense_sparse(container, global_dtype):
    """
    Check that a sparse or dense `centers` format does not influence the
    convergence.

    Non-regression test for gh-13334.
    """
    single_center = container(np.zeros((1, 10)))
    rng = np.random.RandomState(42)
    data = rng.rand(40, 10).astype(global_dtype, copy=False)
    targets = (4 * rng.rand(40)).astype(int)

    estimator = AffinityPropagation(random_state=46)
    estimator.fit(data, targets)
    # Overwrite the fitted centers with the container under test.
    estimator.cluster_centers_ = single_center

    expected_labels = np.zeros(data.shape[0], dtype=int)
    with warnings.catch_warnings():
        # Any ConvergenceWarning is turned into a failure.
        warnings.simplefilter("error", ConvergenceWarning)
        assert_array_equal(estimator.predict(data), expected_labels)
# FIXME: this test is broken with different random states and needs to be revisited.
def test_correct_clusters(global_dtype):
    """Check the labels found on a small precomputed matrix.

    Non-regression test for issue #10832 (incorrect clusters due to a dtype
    change).
    """
    affinity = np.array(
        [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]],
        dtype=global_dtype,
    )
    estimator = AffinityPropagation(
        preference=1, affinity="precomputed", random_state=0
    )
    estimator.fit(affinity)

    assert_array_equal(estimator.labels_, np.array([0, 1, 1, 2]))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input_for_predict(csr_container):
    """Check that `predict` accepts sparse input.

    Non-regression test for issue #20049.
    """
    estimator = AffinityPropagation(affinity="euclidean", random_state=42)
    estimator.fit(X)

    # Two all-zero samples given as an empty 2x2 sparse matrix.
    sparse_samples = csr_container((2, 2))
    predicted = estimator.predict(sparse_samples)
    assert_array_equal(predicted, (2, 2))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input_for_fit_predict(csr_container):
    """Check that `fit_predict` accepts sparse input.

    Non-regression test for issue #20049.
    """
    estimator = AffinityPropagation(affinity="euclidean", random_state=42)
    rng = np.random.RandomState(42)
    sparse_data = csr_container(rng.randint(0, 2, size=(5, 5)))

    predicted = estimator.fit_predict(sparse_data)
    assert_array_equal(predicted, (0, 1, 1, 2, 3))
def test_affinity_propagation_equal_points():
    """Make sure identical points are not split across several clusters.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/pull/20043
    """
    identical_points = np.zeros((8, 1))
    estimator = AffinityPropagation(
        affinity="euclidean", damping=0.5, random_state=42
    )
    estimator.fit(identical_points)
    assert np.all(estimator.labels_ == 0)
@@ -0,0 +1,264 @@
"""Testing for Spectral Biclustering methods"""
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, BiclusterMixin
from sklearn.cluster import SpectralBiclustering, SpectralCoclustering
from sklearn.cluster._bicluster import (
_bistochastic_normalize,
_log_normalize,
_scale_normalize,
)
from sklearn.datasets import make_biclusters, make_checkerboard
from sklearn.metrics import consensus_score, v_measure_score
from sklearn.model_selection import ParameterGrid
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
class MockBiclustering(BiclusterMixin, BaseEstimator):
    """Minimal bicluster estimator used to exercise `get_submatrix`."""

    def __init__(self):
        pass

    def get_indices(self, i):
        """Return fixed row and column indices, whatever `i` is.

        Overridden to reproduce the old `get_submatrix` test.
        """
        row_mask = [True, True, False, False, True]
        col_mask = [False, False, True, True]
        row_indices = np.where(row_mask)[0]
        col_indices = np.where(col_mask)[0]
        return row_indices, col_indices
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_get_submatrix(csr_container):
    """Check `get_submatrix` on dense, sparse and nested-list inputs."""
    data = np.arange(20).reshape(5, 4)
    model = MockBiclustering()
    expected = [[2, 3], [6, 7], [18, 19]]

    for container in (data, csr_container(data), data.tolist()):
        submatrix = model.get_submatrix(0, container)
        if issparse(submatrix):
            submatrix = submatrix.toarray()
        assert_array_equal(submatrix, expected)

        # Overwriting the result must not leak into the input.
        submatrix[:] = -1
        dense_input = container.toarray() if issparse(container) else container
        assert np.all(dense_input != -1)
def _test_shape_indices(model):
# Test get_shape and get_indices on fitted model.
for i in range(model.n_clusters):
m, n = model.get_shape(i)
i_ind, j_ind = model.get_indices(i)
assert len(i_ind) == m
assert len(j_ind) == n
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_coclustering(global_random_seed, csr_container):
    """Check Dhillon's spectral co-clustering on a simple problem."""
    grid = ParameterGrid(
        {
            "svd_method": ["randomized", "arpack"],
            "n_svd_vecs": [None, 20],
            "mini_batch": [False, True],
            "init": ["k-means++"],
            "n_init": [10],
        }
    )
    S, rows, cols = make_biclusters(
        (30, 30), 3, noise=0.1, random_state=global_random_seed
    )
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values

    for matrix in (S, csr_container(S)):
        for grid_point in grid:
            model = SpectralCoclustering(
                n_clusters=3, random_state=global_random_seed, **grid_point
            )
            model.fit(matrix)

            # Every row and every column belongs to exactly one bicluster.
            assert model.rows_.shape == (3, 30)
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_biclustering(global_random_seed, csr_container):
    """Check the Kluger methods on a checkerboard dataset.

    Each non-default parameter value is applied on its own to an otherwise
    identical model, for both dense and sparse input.
    """
    S, rows, cols = make_checkerboard(
        (30, 30), 3, noise=0.5, random_state=global_random_seed
    )

    non_default_params = {
        "method": ["scale", "log"],
        "svd_method": ["arpack"],
        "n_svd_vecs": [20],
        "mini_batch": [True],
    }

    for mat in (S, csr_container(S)):
        for param_name, param_values in non_default_params.items():
            for param_value in param_values:
                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init="k-means++",
                    random_state=global_random_seed,
                )
                model.set_params(**{param_name: param_value})

                if issparse(mat) and model.get_params().get("method") == "log":
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue

                model.fit(mat)

                # 3 row clusters x 3 column clusters give 9 biclusters, and
                # each row / column takes part in exactly 3 of them.
                assert model.rows_.shape == (9, 30)
                assert model.columns_.shape == (9, 30)
                assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
                assert consensus_score(model.biclusters_, (rows, cols)) == 1

                _test_shape_indices(model)
def _do_scale_test(scaled):
"""Check that rows sum to one constant, and columns to another."""
row_sum = scaled.sum(axis=1)
col_sum = scaled.sum(axis=0)
if issparse(scaled):
row_sum = np.asarray(row_sum).squeeze()
col_sum = np.asarray(col_sum).squeeze()
assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1)
assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1)
def _do_bistochastic_test(scaled):
    """Check that rows and columns sum to the same constant."""
    _do_scale_test(scaled)
    mean_col_sum = scaled.sum(axis=0).mean()
    mean_row_sum = scaled.sum(axis=1).mean()
    assert_almost_equal(mean_col_sum, mean_row_sum, decimal=1)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_scale_normalize(global_random_seed, csr_container):
    """Check `_scale_normalize` on dense and sparse input."""
    rng = np.random.RandomState(global_random_seed)
    dense = rng.rand(100, 100)
    for matrix in (dense, csr_container(dense)):
        normalized, _, _ = _scale_normalize(matrix)
        _do_scale_test(normalized)
        # Sparse input must stay sparse.
        if issparse(matrix):
            assert issparse(normalized)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_bistochastic_normalize(global_random_seed, csr_container):
    """Check `_bistochastic_normalize` on dense and sparse input."""
    rng = np.random.RandomState(global_random_seed)
    dense = rng.rand(100, 100)
    for matrix in (dense, csr_container(dense)):
        normalized = _bistochastic_normalize(matrix)
        _do_bistochastic_test(normalized)
        # Sparse input must stay sparse.
        if issparse(matrix):
            assert issparse(normalized)
def test_log_normalize(global_random_seed):
    """Check that a log-normalized matrix plus a constant is bistochastic."""
    rng = np.random.RandomState(global_random_seed)
    matrix = rng.rand(100, 100)
    # Adding any constant to a log-scaled matrix should make it bistochastic.
    shifted = _log_normalize(matrix) + 1
    _do_bistochastic_test(shifted)
def test_fit_best_piecewise(global_random_seed):
    """Check that `_fit_best_piecewise` keeps the two piecewise-constant vectors."""
    candidates = np.array(
        [[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]]
    )
    estimator = SpectralBiclustering(random_state=global_random_seed)
    selected = estimator._fit_best_piecewise(candidates, n_best=2, n_clusters=2)
    assert_array_equal(selected, candidates[:2])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_project_and_cluster(global_random_seed, csr_container):
    """Check `_project_and_cluster` on dense and sparse data."""
    estimator = SpectralBiclustering(random_state=global_random_seed)
    dense = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]])
    projection = np.array([[1, 0], [0, 1], [0, 0]])
    for matrix in (dense, csr_container(dense)):
        found = estimator._project_and_cluster(matrix, projection, n_clusters=2)
        # The first two rows and the last two rows form the two clusters.
        assert_almost_equal(v_measure_score(found, [0, 0, 1, 1]), 1.0)
def test_perfect_checkerboard(global_random_seed):
    """Check that noise-free checkerboards are recovered exactly.

    The same model is refitted on a square, a tall and a wide matrix.
    """
    # XXX Previously failed on build bot (not reproducible)
    model = SpectralBiclustering(
        3, svd_method="arpack", random_state=global_random_seed
    )

    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(
            shape, 3, noise=0, random_state=global_random_seed
        )
        model.fit(S)
        assert consensus_score(model.biclusters_, (rows, cols)) == 1
@pytest.mark.parametrize(
    "params, type_err, err_msg",
    [
        ({"n_clusters": 6}, ValueError, "n_clusters should be <= n_samples=5"),
        ({"n_clusters": (3, 3, 3)}, ValueError, "Incorrect parameter n_clusters"),
        ({"n_clusters": (3, 6)}, ValueError, "Incorrect parameter n_clusters"),
        (
            {"n_components": 3, "n_best": 4},
            ValueError,
            "n_best=4 must be <= n_components=3",
        ),
    ],
)
def test_spectralbiclustering_parameter_validation(params, type_err, err_msg):
    """Check the parameter validation done by `SpectralBiclustering.fit`."""
    square_data = np.arange(25).reshape((5, 5))
    estimator = SpectralBiclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        estimator.fit(square_data)
@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):
    """Check that `n_features_in_` only exists after `fit`."""
    data, _, _ = make_biclusters((3, 3), 3, random_state=0)

    assert not hasattr(est, "n_features_in_")
    est.fit(data)
    assert est.n_features_in_ == 3
@@ -0,0 +1,242 @@
"""
Tests for the birch clustering algorithm.
"""
import numpy as np
import pytest
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import pairwise_distances_argmin, v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
def test_n_samples_leaves_roots(global_random_seed, global_dtype):
    """Sanity check on the number of samples held by the leaves and the root."""
    data, _ = make_blobs(n_samples=10, random_state=global_random_seed)
    data = data.astype(global_dtype, copy=False)
    model = Birch()
    model.fit(data)

    n_in_root = sum(sc.n_samples_ for sc in model.root_.subclusters_)
    n_in_leaves = sum(
        sc.n_samples_ for leaf in model._get_leaves() for sc in leaf.subclusters_
    )
    assert n_in_leaves == data.shape[0]
    assert n_in_root == data.shape[0]
def test_partial_fit(global_random_seed, global_dtype):
    """Check that `fit` is equivalent to successive `partial_fit` calls."""
    data, _ = make_blobs(n_samples=100, random_state=global_random_seed)
    data = data.astype(global_dtype, copy=False)

    full_model = Birch(n_clusters=3)
    full_model.fit(data)

    incremental_model = Birch(n_clusters=None)
    for chunk in (data[:50], data[50:]):
        incremental_model.partial_fit(chunk)
    assert_allclose(
        incremental_model.subcluster_centers_, full_model.subcluster_centers_
    )

    # Calling partial_fit with None after setting n_clusters must give the
    # same global labels as the full fit.
    incremental_model.set_params(n_clusters=3)
    incremental_model.partial_fit(None)
    assert_array_equal(
        incremental_model.subcluster_labels_, full_model.subcluster_labels_
    )
def test_birch_predict(global_random_seed, global_dtype):
    """Check that `predict` assigns each sample to its nearest centroid."""
    rng = np.random.RandomState(global_random_seed)
    data = generate_clustered_data(
        n_clusters=3, n_features=3, n_samples_per_cluster=10
    )
    data = data.astype(global_dtype, copy=False)

    # 30 = n_clusters * n_samples_per_cluster
    order = np.arange(30)
    rng.shuffle(order)
    shuffled = data[order, :]
    model = Birch(n_clusters=4, threshold=1.0)
    model.fit(shuffled)

    # Birch must preserve inputs' dtype
    assert model.subcluster_centers_.dtype == global_dtype

    assert_array_equal(model.labels_, model.predict(shuffled))
    closest = pairwise_distances_argmin(shuffled, model.subcluster_centers_)
    nearest_centroid_labels = model.subcluster_labels_[closest]
    assert_allclose(v_measure_score(nearest_centroid_labels, model.labels_), 1.0)
def test_n_clusters(global_random_seed, global_dtype):
    """Check the behaviour of the `n_clusters` parameter."""
    data, _ = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
    data = data.astype(global_dtype, copy=False)

    int_model = Birch(n_clusters=10)
    int_model.fit(data)
    assert len(int_model.subcluster_centers_) > 10
    assert len(np.unique(int_model.labels_)) == 10

    # Passing an AgglomerativeClustering instance as n_clusters must give
    # the same results as the integer.
    agglomerative = AgglomerativeClustering(n_clusters=10)
    estimator_model = Birch(n_clusters=agglomerative)
    estimator_model.fit(data)
    assert_array_equal(
        int_model.subcluster_labels_, estimator_model.subcluster_labels_
    )
    assert_array_equal(int_model.labels_, estimator_model.labels_)

    # A small number of subclusters must raise a warning.
    coarse_model = Birch(threshold=10000.0)
    with pytest.warns(ConvergenceWarning):
        coarse_model.fit(data)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_X(global_random_seed, global_dtype, csr_container):
    """Check that sparse and dense data give the same results."""
    dense, _ = make_blobs(n_samples=100, centers=10, random_state=global_random_seed)
    dense = dense.astype(global_dtype, copy=False)
    dense_model = Birch(n_clusters=10)
    dense_model.fit(dense)

    sparse = csr_container(dense)
    sparse_model = Birch(n_clusters=10)
    sparse_model.fit(sparse)

    # Birch must preserve inputs' dtype
    assert sparse_model.subcluster_centers_.dtype == global_dtype

    assert_array_equal(dense_model.labels_, sparse_model.labels_)
    assert_allclose(
        dense_model.subcluster_centers_, sparse_model.subcluster_centers_
    )
def test_partial_fit_second_call_error_checks():
    """Check that a later `partial_fit` rejects a different number of features."""
    data, targets = make_blobs(n_samples=100)
    model = Birch(n_clusters=3)
    model.partial_fit(data, targets)

    # Keep only the first column for the second call.
    single_feature = data[:, [0]]
    with pytest.raises(
        ValueError, match="X has 1 features, but Birch is expecting 2 features"
    ):
        model.partial_fit(single_feature, targets)
def check_branching_factor(node, branching_factor):
    """Assert that `node` and its descendants hold at most `branching_factor`
    subclusters each.
    """
    # Explicit stack instead of recursion; pushing children in reverse keeps
    # the pre-order traversal of the tree.
    pending = [node]
    while pending:
        current = pending.pop()
        subclusters = current.subclusters_
        assert branching_factor >= len(subclusters)
        pending.extend(sc.child_ for sc in reversed(subclusters) if sc.child_)
def test_branching_factor(global_random_seed, global_dtype):
    """Check that no node holds more than `branching_factor` subclusters."""
    data, _ = make_blobs(random_state=global_random_seed)
    data = data.astype(global_dtype, copy=False)
    branching_factor = 9

    # A low threshold is used on purpose to maximize the number of
    # subclusters; both with and without a global clustering step.
    for n_clusters in (None, 3):
        model = Birch(
            n_clusters=n_clusters, branching_factor=branching_factor, threshold=0.01
        )
        model.fit(data)
        check_branching_factor(model.root_, branching_factor)
def check_threshold(birch_instance, threshold):
    """Walk the leaf linked list and assert every subcluster radius is at
    most `threshold`.
    """
    leaf = birch_instance.dummy_leaf_.next_leaf_
    while leaf:
        for subcluster in leaf.subclusters_:
            assert threshold >= subcluster.radius
        leaf = leaf.next_leaf_
def test_threshold(global_random_seed, global_dtype):
    """Check that leaf subclusters have a radius no larger than the threshold."""
    data, _ = make_blobs(n_samples=80, centers=4, random_state=global_random_seed)
    data = data.astype(global_dtype, copy=False)

    for threshold in (0.5, 5.0):
        model = Birch(threshold=threshold, n_clusters=None)
        model.fit(data)
        check_threshold(model, threshold)
def test_birch_n_clusters_long_int():
    """Check that Birch accepts `n_clusters` given as an np.int64.

    Such values come for instance from np.arange. See #16484.
    """
    data, _ = make_blobs(random_state=0)
    model = Birch(n_clusters=np.int64(5))
    model.fit(data)
def test_feature_names_out():
    """Check `get_feature_names_out` for `Birch`."""
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    model = Birch(n_clusters=4)
    model.fit(data)

    # One output feature per subcluster center.
    n_subclusters = model.subcluster_centers_.shape[0]
    expected_names = [f"birch{i}" for i in range(n_subclusters)]
    assert_array_equal(expected_names, model.get_feature_names_out())
def test_transform_match_across_dtypes(global_random_seed):
    """Check that float64 and float32 inputs give matching transforms."""
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=global_random_seed)
    model = Birch(n_clusters=4, threshold=1.1)
    transformed_64 = model.fit_transform(data)
    transformed_32 = model.fit_transform(data.astype(np.float32))

    assert_allclose(transformed_64, transformed_32, atol=1e-6)
def test_subcluster_dtype(global_dtype):
    """Check that the subcluster centers keep the dtype of the input."""
    data, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    data = data.astype(global_dtype, copy=False)
    model = Birch(n_clusters=4)
    centers = model.fit(data).subcluster_centers_
    assert centers.dtype == global_dtype
def test_both_subclusters_updated():
    """Check that both subclusters are updated when a node is split, even when
    there are duplicated data points. Non-regression test for #23269.
    """
    duplicated_points = np.array(
        [
            [-2.6192791, -1.5053215],
            [-2.9993038, -1.6863596],
            [-2.3724914, -1.3438171],
            [-2.336792, -1.3417323],
            [-2.4089134, -1.3290224],
            [-2.3724914, -1.3438171],
            [-3.364009, -1.8846745],
            [-2.3724914, -1.3438171],
            [-2.617677, -1.5003285],
            [-2.2960556, -1.3260119],
            [-2.3724914, -1.3438171],
            [-2.5459878, -1.4533926],
            [-2.25979, -1.3003055],
            [-2.4089134, -1.3290224],
            [-2.3724914, -1.3438171],
            [-2.4089134, -1.3290224],
            [-2.5459878, -1.4533926],
            [-2.3724914, -1.3438171],
            [-2.9720619, -1.7058647],
            [-2.336792, -1.3417323],
            [-2.3724914, -1.3438171],
        ],
        dtype=np.float32,
    )

    # Fitting must simply complete without raising.
    model = Birch(branching_factor=5, threshold=1e-5, n_clusters=None)
    model.fit(duplicated_points)
@@ -0,0 +1,158 @@
import numpy as np
import pytest
from sklearn.cluster import BisectingKMeans
from sklearn.metrics import v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"])
@pytest.mark.parametrize("init", ["k-means++", "random"])
def test_three_clusters(bisecting_strategy, init):
    """Run bisecting k-means with three clusters and check that the data is
    split correctly.
    """
    points = np.array(
        [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]]
    )
    model = BisectingKMeans(
        n_clusters=3,
        random_state=0,
        bisecting_strategy=bisecting_strategy,
        init=init,
    )
    model.fit(points)

    expected_centers = [[2, 1], [10, 1], [10, 9]]
    expected_labels = [0, 1, 0, 1, 0, 1, 2, 2, 2]

    # Centers are compared after sorting; labels are compared up to a
    # permutation through the V-measure.
    found_centers = sorted(model.cluster_centers_.tolist())
    assert_allclose(sorted(expected_centers), found_centers)
    assert_allclose(v_measure_score(expected_labels, model.labels_), 1.0)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse(csr_container):
    """Check that Bisecting K-Means finds the same centers for the dense and
    the sparse representation of the same data.
    """
    rng = np.random.RandomState(0)
    dense = rng.rand(20, 2)
    dense[dense < 0.8] = 0
    sparse = csr_container(dense)

    model = BisectingKMeans(n_clusters=3, random_state=0)

    # The same estimator is fitted twice: first on sparse, then on dense data.
    model.fit(sparse)
    centers_from_sparse = model.cluster_centers_

    model.fit(dense)
    centers_from_dense = model.cluster_centers_

    # Both representations must lead to the same centers.
    assert_allclose(centers_from_dense, centers_from_sparse, atol=1e-8)
@pytest.mark.parametrize("n_clusters", [4, 5])
def test_n_clusters(n_clusters):
    """Check that the fitted labels are exactly 0, 1, ..., n_clusters - 1."""
    data = np.random.RandomState(0).rand(10, 2)

    model = BisectingKMeans(n_clusters=n_clusters, random_state=0)
    model.fit(data)

    assert_array_equal(np.unique(model.labels_), np.arange(n_clusters))
def test_one_cluster():
    """Check the degenerate case of a single cluster."""
    points = np.array([[1, 2], [10, 2], [10, 8]])
    model = BisectingKMeans(n_clusters=1, random_state=0).fit(points)

    # Every sample must get label 0, both from `fit` and from `predict`.
    assert np.all(model.labels_ == 0)
    assert np.all(model.predict(points) == 0)

    # The single center is the mean of all the points.
    expected_center = points.mean(axis=0).reshape(1, -1)
    assert_allclose(model.cluster_centers_, expected_center)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_fit_predict(csr_container):
    """Check that `labels_` after `fit(X)` equals `predict(X)` on the same data."""
    data = np.random.RandomState(0).rand(10, 2)

    if csr_container is not None:
        # Zero out most entries before switching to the sparse container.
        data[data < 0.8] = 0
        data = csr_container(data)

    model = BisectingKMeans(n_clusters=3, random_state=0)
    model.fit(data)

    assert_array_equal(model.labels_, model.predict(data))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_dtype_preserved(csr_container, global_dtype):
    """Check that `cluster_centers_` has the same dtype as the input data."""
    data = np.random.RandomState(0).rand(10, 2).astype(global_dtype, copy=False)

    if csr_container is not None:
        data[data < 0.8] = 0
        data = csr_container(data)

    model = BisectingKMeans(n_clusters=3, random_state=0)
    model.fit(data)

    assert model.cluster_centers_.dtype == global_dtype
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_float32_float64_equivalence(csr_container):
    """Check that fitting on the data and on its float32 cast gives the same
    centers and labels.
    """
    data = np.random.RandomState(0).rand(10, 2)

    if csr_container is not None:
        data[data < 0.8] = 0
        data = csr_container(data)

    model_64 = BisectingKMeans(n_clusters=3, random_state=0).fit(data)
    model_32 = BisectingKMeans(n_clusters=3, random_state=0).fit(
        data.astype(np.float32)
    )

    assert_allclose(model_32.cluster_centers_, model_64.cluster_centers_)
    assert_array_equal(model_32.labels_, model_64.labels_)
@pytest.mark.parametrize("algorithm", ("lloyd", "elkan"))
def test_no_crash_on_empty_bisections(algorithm):
    """Check that `predict` does not crash when bisections end up empty.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27081
    """
    rng = np.random.RandomState(0)
    train = rng.rand(3000, 10)
    model = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(train)

    # Predicting on rescaled data triggers the pathological case where the
    # inner mask leads to empty bisections.
    test = 50 * rng.rand(100, 10)
    predicted = model.predict(test)  # should not crash with idiv by 0

    valid_labels = np.arange(10)
    assert np.isin(np.unique(predicted), valid_labels).all()
def test_one_feature():
    """Check that no error is raised when the data has a single feature.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27236
    """
    # Use a locally seeded generator rather than the global `np.random` state
    # so that the test is deterministic and independent of test ordering.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(128, 1))
    BisectingKMeans(bisecting_strategy="biggest_inertia", random_state=0).fit(X)
@@ -0,0 +1,434 @@
"""
Tests for DBSCAN clustering algorithm
"""
import pickle
import warnings
import numpy as np
import pytest
from scipy.spatial import distance
from sklearn.cluster import DBSCAN, dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS
# Module-level data shared by the DBSCAN tests below: `n_clusters` is passed to
# the data generator and is also the cluster count the tests assert against.
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
def test_dbscan_similarity():
    """Check DBSCAN on a precomputed, max-normalised distance matrix."""
    # Parameters chosen specifically for this task.
    eps, min_samples = 0.15, 10

    # Pairwise distances rescaled by their maximum.
    dist = distance.squareform(distance.pdist(X))
    dist /= np.max(dist)

    # Function interface.
    _, labels_func = dbscan(
        dist, metric="precomputed", eps=eps, min_samples=min_samples
    )
    # Number of clusters, ignoring noise (-1) if present.
    assert len(set(labels_func)) - int(-1 in labels_func) == n_clusters

    # Estimator interface.
    estimator = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels_est = estimator.fit(dist).labels_
    assert len(set(labels_est)) - int(-1 in labels_est) == n_clusters
def test_dbscan_feature():
    """Check DBSCAN on a feature array with the euclidean metric."""
    # Parameters chosen specifically for this task. `eps` differs from the
    # similarity test because the distances are not normalised here.
    params = {"metric": "euclidean", "eps": 0.8, "min_samples": 10}

    # Function interface.
    _, labels_func = dbscan(X, **params)
    # Number of clusters, ignoring noise (-1) if present.
    assert len(set(labels_func)) - int(-1 in labels_func) == n_clusters

    # Estimator interface.
    labels_est = DBSCAN(**params).fit(X).labels_
    assert len(set(labels_est)) - int(-1 in labels_est) == n_clusters
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_dbscan_sparse(lil_container):
    """Check that sparse and dense feature inputs give identical results."""
    params = {"eps": 0.8, "min_samples": 10}

    sparse_core, sparse_labels = dbscan(lil_container(X), **params)
    dense_core, dense_labels = dbscan(X, **params)

    assert_array_equal(dense_core, sparse_core)
    assert_array_equal(dense_labels, sparse_labels)
@pytest.mark.parametrize("include_self", [False, True])
def test_dbscan_sparse_precomputed(include_self):
    """Check that a sparse radius-neighbors graph and the dense distance
    matrix give identical results with `metric="precomputed"`.
    """
    dense_dist = pairwise_distances(X)

    neighbors = NearestNeighbors(radius=0.9).fit(X)
    query = X if include_self else None
    sparse_dist = neighbors.radius_neighbors_graph(X=query, mode="distance")

    # Ensure it is sparse not merely on diagonals:
    n_samples = dense_dist.shape[0]
    assert sparse_dist.nnz < n_samples * (n_samples - 1)

    params = {"eps": 0.8, "min_samples": 10, "metric": "precomputed"}
    sparse_core, sparse_labels = dbscan(sparse_dist, **params)
    dense_core, dense_labels = dbscan(dense_dist, **params)

    assert_array_equal(dense_core, sparse_core)
    assert_array_equal(dense_labels, sparse_labels)
def test_dbscan_sparse_precomputed_different_eps():
    """Check that a precomputed neighbors graph built with a radius larger
    than DBSCAN's `eps` gives the same result as one built with that `eps`.
    """
    lower_eps = 0.2
    higher_eps = lower_eps + 0.7

    # Graph built with exactly the radius later used as `eps`.
    graph_lower = (
        NearestNeighbors(radius=lower_eps)
        .fit(X)
        .radius_neighbors_graph(X, mode="distance")
    )
    result_lower = dbscan(graph_lower, eps=lower_eps, metric="precomputed")

    # Graph built with a larger radius, still clustered with the smaller `eps`.
    graph_higher = (
        NearestNeighbors(radius=higher_eps)
        .fit(X)
        .radius_neighbors_graph(X, mode="distance")
    )
    result_higher = dbscan(graph_higher, eps=lower_eps, metric="precomputed")

    assert_array_equal(result_lower[0], result_higher[0])
    assert_array_equal(result_lower[1], result_higher[1])
@pytest.mark.parametrize("metric", ["precomputed", "minkowski"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
def test_dbscan_input_not_modified(metric, csr_container):
    """Check that `dbscan` leaves its input unchanged."""
    data = np.random.RandomState(0).rand(10, 10)
    if csr_container is not None:
        data = csr_container(data)
    snapshot = data.copy()

    dbscan(data, metric=metric)

    if csr_container is None:
        assert_array_equal(data, snapshot)
    else:
        # Sparse containers are compared through their dense form.
        assert_array_equal(data.toarray(), snapshot.toarray())
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container):
    """Check that the pre-computed sparse matrix is not modified in-place.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27508
    """
    dense = np.random.RandomState(0).rand(10, 10)
    # Zeros on the diagonal become implicit once the matrix is made sparse.
    # If the input were modified in-place, these zeros would turn explicit.
    np.fill_diagonal(dense, 0)
    sparse = csr_container(dense)
    for row, col in zip(*sparse.nonzero()):
        assert row != col

    snapshot = sparse.copy()
    dbscan(sparse, metric="precomputed")

    # No in-place modification is allowed, not even the creation of explicit
    # zero entries.
    assert sparse.nnz == snapshot.nnz
    assert_array_equal(sparse.toarray(), snapshot.toarray())
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_dbscan_no_core_samples(csr_container):
    """Check the fitted attributes when no sample is a core sample."""
    dense = np.random.RandomState(0).rand(40, 10)
    dense[dense < 0.8] = 0

    for data in (dense, csr_container(dense)):
        model = DBSCAN(min_samples=6).fit(data)

        # No components, every sample labelled -1, no core indices.
        assert_array_equal(model.components_, np.empty((0, data.shape[1])))
        assert_array_equal(model.labels_, -1)
        assert model.core_sample_indices_.shape == (0,)
def test_dbscan_callable():
    """Check DBSCAN with a callable metric."""
    # Parameters chosen specifically for this task. `eps` differs from the
    # similarity test because the distances are not normalised here.
    # The metric is the function object itself, not its string key.
    params = {
        "metric": distance.euclidean,
        "eps": 0.8,
        "min_samples": 10,
        "algorithm": "ball_tree",
    }

    # Function interface.
    _, labels_func = dbscan(X, **params)
    # Number of clusters, ignoring noise (-1) if present.
    assert len(set(labels_func)) - int(-1 in labels_func) == n_clusters

    # Estimator interface.
    labels_est = DBSCAN(**params).fit(X).labels_
    assert len(set(labels_est)) - int(-1 in labels_est) == n_clusters
def test_dbscan_metric_params():
    """Check that DBSCAN works with the `metric_params` argument."""
    eps = 0.8
    min_samples = 10
    p = 1

    # Reference fit: `p` is given through `metric_params` only. No warning is
    # expected in this configuration.
    with warnings.catch_warnings(record=True) as warns:
        model = DBSCAN(
            metric="minkowski",
            metric_params={"p": p},
            eps=eps,
            p=None,
            min_samples=min_samples,
            algorithm="ball_tree",
        ).fit(X)
    assert not warns, warns[0].message
    core_sample_1, labels_1 = model.core_sample_indices_, model.labels_

    # Passing Minkowski `p` directly must give the same result.
    model = DBSCAN(
        metric="minkowski", eps=eps, min_samples=min_samples, algorithm="ball_tree", p=p
    ).fit(X)
    assert_array_equal(core_sample_1, model.core_sample_indices_)
    assert_array_equal(labels_1, model.labels_)

    # Minkowski with p=1 should be equivalent to the Manhattan distance.
    model = DBSCAN(
        metric="manhattan", eps=eps, min_samples=min_samples, algorithm="ball_tree"
    ).fit(X)
    assert_array_equal(core_sample_1, model.core_sample_indices_)
    assert_array_equal(labels_1, model.labels_)

    # When `p` is given both ways, a warning is raised and the value from
    # `metric_params` takes precedence over the `__init__` one.
    with pytest.warns(
        SyntaxWarning,
        match=(
            "Parameter p is found in metric_params. "
            "The corresponding parameter from __init__ "
            "is ignored."
        ),
    ):
        model = DBSCAN(
            metric="minkowski",
            metric_params={"p": p},
            eps=eps,
            p=p + 1,
            min_samples=min_samples,
            algorithm="ball_tree",
        ).fit(X)
        core_sample_4, labels_4 = model.core_sample_indices_, model.labels_

    assert_array_equal(core_sample_1, core_sample_4)
    assert_array_equal(labels_1, labels_4)
def test_dbscan_balltree():
    """Check DBSCAN with tree-based neighbor searches."""
    eps = 0.8
    min_samples = 10

    # Baseline on the precomputed distance matrix.
    dist = pairwise_distances(X)
    _, labels = dbscan(dist, metric="precomputed", eps=eps, min_samples=min_samples)
    # Number of clusters, ignoring noise (-1) if present.
    assert len(set(labels)) - int(-1 in labels) == n_clusters

    # Estimator variants that must all recover the same number of clusters.
    estimator_kwargs = (
        {"p": 2.0, "algorithm": "ball_tree"},
        {"p": 2.0, "algorithm": "kd_tree"},
        {"p": 1.0, "algorithm": "ball_tree"},
        {"leaf_size": 20, "algorithm": "ball_tree"},
    )
    for kwargs in estimator_kwargs:
        model = DBSCAN(eps=eps, min_samples=min_samples, **kwargs)
        labels = model.fit(X).labels_
        assert len(set(labels)) - int(-1 in labels) == n_clusters
def test_input_validation():
    """Check that `DBSCAN.fit` accepts a list of lists."""
    nested_lists = [[1.0, 2.0], [3.0, 4.0]]
    estimator = DBSCAN()
    estimator.fit(nested_lists)  # must not raise exception
def test_pickle():
    """Check that a DBSCAN instance keeps its class through a pickle round trip."""
    original = DBSCAN()
    restored = pickle.loads(pickle.dumps(original))
    assert type(restored) == original.__class__
def test_boundaries():
    """Check the inclusive boundary conditions on `min_samples` and `eps`."""
    # `min_samples` counts the core point itself.
    core_indices, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert 0 in core_indices

    # A neighbor lying exactly at distance `eps` is included ...
    core_indices, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert 0 in core_indices

    # ... while one slightly farther than `eps` is not.
    core_indices, _ = dbscan([[0], [1], [1]], eps=0.99, min_samples=2)
    assert 0 not in core_indices
def test_weighted_dbscan(global_random_seed):
    """Check the handling of `sample_weight` by `dbscan` and `DBSCAN`."""
    points = [[0], [1]]

    # `sample_weight` of the wrong length is rejected.
    for bad_weight in ([2], [2, 3, 4]):
        with pytest.raises(ValueError):
            dbscan(points, sample_weight=bad_weight)

    # `sample_weight` has an effect (points farther apart than the default eps).
    default_eps_cases = (
        ([], None),
        ([], [5, 5]),
        ([0], [6, 5]),
        ([0, 1], [6, 6]),
    )
    for expected_core, weight in default_eps_cases:
        found_core = dbscan(points, sample_weight=weight, min_samples=6)[0]
        assert_array_equal(expected_core, found_core)

    # Points within eps of each other, including the effect of non-positive
    # and non-integer sample_weight.
    wide_eps_cases = (
        ([0, 1], [5, 1]),
        ([], [5, 0]),
        ([0, 1], [5.9, 0.1]),
        ([0, 1], [6, 0]),
        ([], [6, -1]),
    )
    for expected_core, weight in wide_eps_cases:
        found_core = dbscan(points, sample_weight=weight, eps=1.5, min_samples=6)[0]
        assert_array_equal(expected_core, found_core)

    # For non-negative sample_weight, cores should be identical to repetition.
    rng = np.random.RandomState(global_random_seed)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core_weighted, labels_weighted = dbscan(X, sample_weight=sample_weight)
    assert len(labels_weighted) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, _ = dbscan(X_repeated)
    repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    repeated_mask[core_repeated] = True
    weighted_mask = np.zeros(X.shape[0], dtype=bool)
    weighted_mask[core_weighted] = True
    assert_array_equal(np.repeat(weighted_mask, sample_weight), repeated_mask)

    # sample_weight should work with a precomputed distance matrix.
    dist = pairwise_distances(X)
    core_precomputed, labels_precomputed = dbscan(
        dist, sample_weight=sample_weight, metric="precomputed"
    )
    assert_array_equal(core_weighted, core_precomputed)
    assert_array_equal(labels_weighted, labels_precomputed)

    # sample_weight should work with the estimator's `fit` ...
    fitted = DBSCAN().fit(X, sample_weight=sample_weight)
    assert_array_equal(core_weighted, fitted.core_sample_indices_)
    assert_array_equal(labels_weighted, fitted.labels_)

    # ... and with its `fit_predict`.
    estimator = DBSCAN()
    predicted = estimator.fit_predict(X, sample_weight=sample_weight)
    assert_array_equal(core_weighted, estimator.core_sample_indices_)
    assert_array_equal(labels_weighted, predicted)
    assert_array_equal(labels_weighted, estimator.labels_)
@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"])
def test_dbscan_core_samples_toy(algorithm):
    """Check core samples and labels on a 1D toy dataset for growing
    values of `min_samples`.
    """
    points = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(points)

    # Each case is (min_samples, expected core samples, expected labels).
    cases = (
        # Degenerate case: every sample is a core sample, either with its own
        # cluster or including other close core samples.
        (1, np.arange(n_samples), [0, 1, 1, 1, 2, 3, 4]),
        # Only the 3 samples from the denser area are core samples. All other
        # points are isolated and considered noise.
        (2, [1, 2, 3], [-1, 0, 0, 0, -1, -1, -1]),
        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        (3, [2], [-1, 0, 0, 0, -1, -1, -1]),
        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        (4, [], np.full(n_samples, -1.0)),
    )
    for min_samples, expected_core, expected_labels in cases:
        core_samples, labels = dbscan(
            points, algorithm=algorithm, eps=1, min_samples=min_samples
        )
        assert_array_equal(core_samples, expected_core)
        assert_array_equal(labels, expected_labels)
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
    """Check that degenerate precomputed matrices yield a single label value.

    See https://github.com/scikit-learn/scikit-learn/issues/4641 for more
    details.
    """
    for degenerate in (np.eye(10), np.zeros((10, 10))):
        model = DBSCAN(eps=0.5, metric="precomputed").fit(degenerate)
        assert len(set(model.labels_)) == 1
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_dbscan_precomputed_metric_with_initial_rows_zero(csr_container):
    """Check DBSCAN on a sparse precomputed matrix whose first two rows
    contain only zeros.
    """
    dense = np.array(
        [
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
            [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
            [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0],
        ]
    )
    sparse = csr_container(dense)

    model = DBSCAN(eps=0.2, metric="precomputed", min_samples=2)
    model.fit(sparse)

    assert_array_equal(model.labels_, [-1, -1, 0, 0, 0, 1, 1])
@@ -0,0 +1,81 @@
"""
Tests for sklearn.cluster._feature_agglomeration
"""
# Authors: Sergul Aydore 2017
import warnings
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.datasets import make_blobs
from sklearn.utils._testing import assert_array_almost_equal
def test_feature_agglomeration():
    """Check fit, transform and inverse_transform of `FeatureAgglomeration`
    with mean and median pooling on a single-sample input.
    """
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    # Each case is (pooling function, expected pooled value).
    for pooling_func, expected in ((np.mean, 1 / 3.0), (np.median, 0.0)):
        agglo = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=pooling_func)
        agglo.fit(X)

        # One label per feature, all in the same cluster.
        assert np.size(np.unique(agglo.labels_)) == n_clusters
        assert np.size(agglo.labels_) == X.shape[1]

        # Test transform
        Xt = agglo.transform(X)
        assert Xt.shape[1] == n_clusters
        assert Xt == np.array([expected])

        # Test inverse transform
        X_full = agglo.inverse_transform(Xt)
        assert np.unique(X_full[0]).size == n_clusters
        assert_array_almost_equal(agglo.transform(X_full), Xt)
def test_feature_agglomeration_feature_names_out():
    """Check the names returned by `FeatureAgglomeration.get_feature_names_out`."""
    data, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3).fit(data)

    # One name per fitted cluster: lowercased class name followed by the index.
    expected_names = [f"featureagglomeration{i}" for i in range(agglo.n_clusters_)]

    assert_array_equal(expected_names, agglo.get_feature_names_out())
# TODO(1.7): remove this test
def test_inverse_transform_Xt_deprecation():
    """Check the argument handling of `inverse_transform` around the renaming
    of `Xt` to `X`.
    """
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)
    agglo = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean)
    agglo.fit(X)
    pooled = agglo.transform(X)

    # Calling without any argument is an error.
    with pytest.raises(TypeError, match="Missing required positional argument"):
        agglo.inverse_transform()

    # Passing both the old and the new name is an error.
    with pytest.raises(TypeError, match="Cannot use both X and Xt. Use X only."):
        agglo.inverse_transform(X=pooled, Xt=pooled)

    # The positional form must not emit any warning.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("error")
        agglo.inverse_transform(pooled)

    # The old keyword still works but warns.
    with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"):
        agglo.inverse_transform(Xt=pooled)
@@ -0,0 +1,602 @@
"""
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
"""
import numpy as np
import pytest
from scipy import stats
from scipy.spatial import distance
from sklearn.cluster import HDBSCAN
from sklearn.cluster._hdbscan._tree import (
CONDENSED_dtype,
_condense_tree,
_do_labelling,
)
from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING
from sklearn.datasets import make_blobs
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances
from sklearn.neighbors import BallTree, KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# Module-level dataset shared by the HDBSCAN tests below: 200 blob samples that
# are shuffled and then standardized. `y` holds the matching blob assignments.
X, y = make_blobs(n_samples=200, random_state=10)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)
# Values passed as HDBSCAN's `algorithm` parameter by the parametrized tests.
ALGORITHMS = [
"kd_tree",
"ball_tree",
"brute",
"auto",
]
# -1 together with every "label" value found in `_OUTLIER_ENCODING`; these
# labels are excluded when `check_label_quality` counts clusters.
OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()}
def check_label_quality(labels, threshold=0.99):
    """Assert that `labels` holds three clusters that closely match `y`.

    Labels contained in `OUTLIER_SET` are not counted as clusters. The
    agreement with the module-level `y` is measured with the Fowlkes-Mallows
    score, which must be strictly greater than `threshold`.
    """
    cluster_labels = set(labels) - OUTLIER_SET
    assert len(cluster_labels) == 3

    score = fowlkes_mallows_score(labels, y)
    assert score > threshold
@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING)
def test_outlier_data(outlier_type):
    """
    Tests if np.inf and np.nan data are each treated as special outliers.
    """
    # Per outlier type: the value injected into the data and the predicate
    # used to find the samples carrying the matching probability.
    outlier, prob_check = {
        "infinite": (np.inf, lambda x, y: x == y),
        "missing": (np.nan, lambda x, y: np.isnan(x)),
    }[outlier_type]
    encoding = _OUTLIER_ENCODING[outlier_type]
    label = encoding["label"]
    prob = encoding["prob"]

    # Corrupt rows 0 (partially) and 5 (entirely).
    X_outlier = X.copy()
    X_outlier[0] = [outlier, 1]
    X_outlier[5] = [outlier, outlier]
    model = HDBSCAN().fit(X_outlier)

    # Exactly the corrupted rows carry the special label and probability.
    assert_array_equal(np.flatnonzero(model.labels_ == label), [0, 5])
    flagged_probs = prob_check(model.probabilities_, prob)
    assert_array_equal(np.flatnonzero(flagged_probs), [0, 5])

    # Fitting on the untouched rows only must give the same labels for them.
    clean_indices = [idx for idx in range(200) if idx not in (0, 5)]
    clean_model = HDBSCAN().fit(X_outlier[clean_indices])
    assert_array_equal(clean_model.labels_, model.labels_[clean_indices])
def test_hdbscan_distance_matrix():
    """
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    """
    dist = euclidean_distances(X)
    dist_before = dist.copy()

    # With `copy=True` the input matrix must be left untouched.
    labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(dist)
    assert_allclose(dist, dist_before)
    check_label_quality(labels)

    # A feature array is not a valid precomputed distance matrix.
    with pytest.raises(
        ValueError, match=r"The precomputed distance matrix.*has shape"
    ):
        HDBSCAN(metric="precomputed", copy=True).fit_predict(X)

    # Ensure the matrix is not symmetric
    dist[0, 1] = 10
    dist[1, 0] = 1
    with pytest.raises(ValueError, match=r"The precomputed distance matrix.*values"):
        HDBSCAN(metric="precomputed").fit_predict(dist)
@pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS])
def test_hdbscan_sparse_distance_matrix(sparse_constructor):
    """
    Tests that HDBSCAN works with sparse distance matrices.
    """
    # Max-normalised pairwise distances.
    dist = distance.squareform(distance.pdist(X))
    dist /= np.max(dist)

    # Zero out the entries at or above the median distance, then drop them
    # from the sparse structure.
    median_distance = stats.scoreatpercentile(dist.flatten(), 50)
    dist[dist >= median_distance] = 0.0
    sparse_dist = sparse_constructor(dist)
    sparse_dist.eliminate_zeros()

    labels = HDBSCAN(metric="precomputed").fit_predict(sparse_dist)
    check_label_quality(labels)
def test_hdbscan_feature_array():
    """
    Tests that HDBSCAN works with a feature array. The quality check is a
    simple heuristic that guards against regressions.
    """
    estimator = HDBSCAN()
    predicted = estimator.fit_predict(X)
    check_label_quality(predicted)
@pytest.mark.parametrize("algo", ALGORITHMS)
@pytest.mark.parametrize("metric", _VALID_METRICS)
def test_hdbscan_algorithms(algo, metric):
    """
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    """
    check_label_quality(HDBSCAN(algorithm=algo).fit_predict(X))

    # Validation for brute is handled by `pairwise_distances`
    if algo in ("brute", "auto"):
        return

    tree_classes = {
        "kd_tree": KDTree,
        "ball_tree": BallTree,
    }
    n_features = X.shape[1]
    # Extra parameters needed by some metrics; None for all the others.
    metric_params = {
        "mahalanobis": {"V": np.eye(n_features)},
        "seuclidean": {"V": np.ones(n_features)},
        "minkowski": {"p": 2},
        "wminkowski": {"p": 2, "w": np.ones(n_features)},
    }.get(metric, None)

    estimator = HDBSCAN(algorithm=algo, metric=metric, metric_params=metric_params)

    if metric not in tree_classes[algo].valid_metrics:
        # The tree does not support this metric: fitting must fail.
        with pytest.raises(ValueError):
            estimator.fit(X)
    elif metric == "wminkowski":
        with pytest.warns(FutureWarning):
            estimator.fit(X)
    else:
        estimator.fit(X)
def test_dbscan_clustering():
    """
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This is a sanity check rather than a rigorous evaluation.
    """
    fitted = HDBSCAN().fit(X)
    dbscan_labels = fitted.dbscan_clustering(0.3)

    # A looser threshold is used here because dbscan produces a more
    # constrained clustering representation.
    check_label_quality(dbscan_labels, threshold=0.92)
@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1))
def test_dbscan_clustering_outlier_data(cut_distance):
    """
    Tests if np.inf and np.nan data are each treated as special outliers by
    `dbscan_clustering`, and that the remaining samples are labelled as if the
    outliers had never been part of the data.
    """
    missing_label = _OUTLIER_ENCODING["missing"]["label"]
    infinite_label = _OUTLIER_ENCODING["infinite"]["label"]

    # Row 0 is infinite only, row 2 is missing only, row 5 is both.
    X_outlier = X.copy()
    X_outlier[0] = [np.inf, 1]
    X_outlier[2] = [1, np.nan]
    X_outlier[5] = [np.inf, np.nan]
    model = HDBSCAN().fit(X_outlier)
    labels = model.dbscan_clustering(cut_distance=cut_distance)

    missing_labels_idx = np.flatnonzero(labels == missing_label)
    assert_array_equal(missing_labels_idx, [2, 5])

    infinite_labels_idx = np.flatnonzero(labels == infinite_label)
    assert_array_equal(infinite_labels_idx, [0])

    # Take the set union of both index arrays. Adding the arrays with `+`
    # would broadcast element-wise ([2, 5] + [0] == [2, 5]) and silently leave
    # the infinite sample in the "clean" subset.
    outlier_idx = np.union1d(missing_labels_idx, infinite_labels_idx)
    clean_idx = np.setdiff1d(np.arange(X_outlier.shape[0]), outlier_idx)

    clean_model = HDBSCAN().fit(X_outlier[clean_idx])
    clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance)
    assert_array_equal(clean_labels, labels[clean_idx])
def test_hdbscan_best_balltree_metric():
    """
    Tests that HDBSCAN using `BallTree` works.
    """
    unit_variances = np.ones(X.shape[1])
    estimator = HDBSCAN(metric="seuclidean", metric_params={"V": unit_variances})
    check_label_quality(estimator.fit_predict(X))
def test_hdbscan_no_clusters():
    """
    Tests that HDBSCAN does not generate any valid cluster when
    `min_cluster_size` is too large for the data.
    """
    estimator = HDBSCAN(min_cluster_size=len(X) - 1)
    found_labels = set(estimator.fit_predict(X))
    assert found_labels.issubset(OUTLIER_SET)
def test_hdbscan_min_cluster_size():
    """
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points.
    """
    for min_cluster_size in range(2, len(X)):
        predicted = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X)
        non_noise = [label for label in predicted if label != -1]
        if not non_noise:
            # Everything is noise: there is no cluster to check.
            continue
        cluster_sizes = np.bincount(non_noise)
        assert np.min(cluster_sizes) >= min_cluster_size
def test_hdbscan_callable_metric():
    """
    Tests that HDBSCAN works when passed a callable metric.
    """
    estimator = HDBSCAN(metric=distance.euclidean)
    check_label_quality(estimator.fit_predict(X))
@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"])
def test_hdbscan_precomputed_non_brute(tree):
    """
    Tests that HDBSCAN raises an error when `metric="precomputed"` is combined
    with a tree-based algorithm.
    """
    estimator = HDBSCAN(metric="precomputed", algorithm=tree)
    with pytest.raises(ValueError, match="precomputed is not a valid metric for"):
        estimator.fit(X)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_hdbscan_sparse(csr_container):
    """
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Correctness is evaluated against the same data passed as a dense array.
    """
    dense_labels = HDBSCAN().fit(X).labels_
    check_label_quality(dense_labels)

    sparse_template = csr_container(X)
    X_sparse = sparse_template.copy()
    assert_array_equal(dense_labels, HDBSCAN().fit(X_sparse).labels_)

    # The sparse and dense non-precomputed routines must return the same
    # labels when the 0th observation contains the outlier.
    outlier_cases = ((np.inf, "infinite"), (np.nan, "missing"))
    for outlier_val, outlier_type in outlier_cases:
        X_dense = X.copy()
        X_dense[0, 0] = outlier_val
        dense_labels = HDBSCAN().fit(X_dense).labels_
        check_label_quality(dense_labels)
        assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"]

        X_sparse = sparse_template.copy()
        X_sparse[0, 0] = outlier_val
        sparse_labels = HDBSCAN().fit(X_sparse).labels_
        assert_array_equal(dense_labels, sparse_labels)

    # `X_sparse` is the matrix left over from the last loop iteration.
    with pytest.raises(
        ValueError, match="Sparse data matrices only support algorithm `brute`."
    ):
        HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse)
@pytest.mark.parametrize("algorithm", ALGORITHMS)
def test_hdbscan_centers(algorithm):
    """
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    """
    true_centers = [(0.0, 0.0), (3.0, 3.0)]
    blobs, _ = make_blobs(
        n_samples=2000, random_state=0, centers=true_centers, cluster_std=0.5
    )
    fitted = HDBSCAN(store_centers="both").fit(blobs)

    # Both kinds of stored centers must be close to the generating centers.
    stored = zip(true_centers, fitted.centroids_, fitted.medoids_)
    for expected, centroid, medoid in stored:
        assert_allclose(expected, centroid, rtol=1, atol=0.05)
        assert_allclose(expected, medoid, rtol=1, atol=0.05)

    # Ensure that nothing is done for noise
    all_noise = HDBSCAN(
        algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0]
    ).fit(X)
    assert all_noise.centroids_.shape[0] == 0
    assert all_noise.medoids_.shape[0] == 0
def test_hdbscan_allow_single_cluster_with_epsilon():
    """
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    """
    no_structure = np.random.RandomState(0).rand(150, 2)
    shared_params = {
        "min_cluster_size": 5,
        "cluster_selection_method": "eom",
        "allow_single_cluster": True,
    }

    # without epsilon we should see many noise points as children of root.
    estimator = HDBSCAN(cluster_selection_epsilon=0.0, **shared_params)
    unique_labels, counts = np.unique(
        estimator.fit_predict(no_structure), return_counts=True
    )
    assert len(unique_labels) == 2
    # Arbitrary heuristic. Would prefer something more precise.
    assert counts[unique_labels == -1] > 30

    # for this random seed an epsilon of 0.18 will produce exactly 2 noise
    # points at that cut in single linkage.
    estimator = HDBSCAN(
        cluster_selection_epsilon=0.18, algorithm="kd_tree", **shared_params
    )
    unique_labels, counts = np.unique(
        estimator.fit_predict(no_structure), return_counts=True
    )
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2
def test_hdbscan_better_than_dbscan():
    """
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    """
    blob_centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]]
    X, y = make_blobs(
        n_samples=750,
        centers=blob_centers,
        cluster_std=[0.2, 0.35, 1.35, 1.35],
        random_state=0,
    )
    labels = HDBSCAN().fit(X).labels_

    # Number of clusters, ignoring noise (-1) if present.
    found_clusters = set(labels) - {-1}
    assert len(found_clusters) == 4

    # NOTE(review): the comparison below is evaluated but its result is
    # discarded, so it checks nothing beyond the call not raising. It looks
    # like an `assert` is missing; confirm the threshold holds before adding it.
    fowlkes_mallows_score(labels, y) > 0.99
@pytest.mark.parametrize(
    "kwargs, X",
    [
        ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])),
        ({"metric": "precomputed"}, [[1, 2], [2, 1]]),
        ({}, [[1, 2], [3, 4]]),
    ],
)
def test_hdbscan_usable_inputs(X, kwargs):
    """
    Tests that HDBSCAN can be fitted on array-likes and on precomputed inputs
    with non-finite points.
    """
    estimator = HDBSCAN(min_samples=1, **kwargs)
    estimator.fit(X)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_hdbscan_sparse_distances_too_few_nonzero(csr_container):
    """
    Check the error raised for a sparse precomputed matrix that stores too
    few non-zero distances (here: an all-zero 10x10 matrix).
    """
    empty_distances = csr_container(np.zeros((10, 10)))
    estimator = HDBSCAN(metric="precomputed")

    with pytest.raises(ValueError, match="There exists points with fewer than"):
        estimator.fit(empty_distances)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_hdbscan_sparse_distances_disconnected_graph(csr_container):
    """
    Check the error raised when a sparse precomputed distance matrix
    describes a graph with several connected components.
    """
    # Build a symmetric matrix with 2 connected components.
    dense = np.zeros((20, 20))
    dense[:5, :5] = 1
    dense[5:, 15:] = 1
    dense = dense + dense.T
    sparse_distances = csr_container(dense)

    estimator = HDBSCAN(metric="precomputed")
    expected = "HDBSCAN cannot be perfomed on a disconnected graph"
    with pytest.raises(ValueError, match=expected):
        estimator.fit(sparse_distances)
def test_hdbscan_tree_invalid_metric():
    """
    Check that the tree-based algorithms reject metrics they do not support.
    """

    def identity_metric(x):
        return x

    msg = (
        ".* is not a valid metric for a .*-based algorithm\\. Please select a different"
        " metric\\."
    )

    # Callables are not supported by either tree-based algorithm.
    for tree_algorithm in ("kd_tree", "ball_tree"):
        with pytest.raises(ValueError, match=msg):
            HDBSCAN(algorithm=tree_algorithm, metric=identity_metric).fit(X)

    # At the time this test was written, the metrics valid for KDTree were a
    # strict subset of those valid for BallTree; pick one that only BallTree
    # accepts, if there is any.
    ball_tree_only = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics))
    if ball_tree_only:
        with pytest.raises(ValueError, match=msg):
            HDBSCAN(algorithm="kd_tree", metric=ball_tree_only[0]).fit(X)
def test_hdbscan_too_many_min_samples():
    """
    Check the error raised when `min_samples` exceeds the number of samples.
    """
    one_too_many = len(X) + 1
    estimator = HDBSCAN(min_samples=one_too_many)

    with pytest.raises(ValueError, match=r"min_samples (.*) must be at most"):
        estimator.fit(X)
def test_hdbscan_precomputed_dense_nan():
    """
    Check the error raised when a dense precomputed input contains `np.nan`.
    """
    # Work on a copy so the module-level data is left untouched.
    corrupted = X.copy()
    corrupted[0, 0] = np.nan

    estimator = HDBSCAN(metric="precomputed")
    expected = "np.nan values found in precomputed-dense"
    with pytest.raises(ValueError, match=expected):
        estimator.fit(corrupted)
@pytest.mark.parametrize("allow_single_cluster", [True, False])
@pytest.mark.parametrize("epsilon", [0, 0.1])
def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon):
    """
    Check that `_do_labelling` assigns consistent labels on three
    well-separated blobs.
    """
    n_samples = 48
    # The centers are far apart so the clusters are distinct with no overlap.
    blob_centers = [[0, 0], [10, 0], [0, 10]]
    X, y = make_blobs(n_samples, random_state=global_random_seed, centers=blob_centers)

    est = HDBSCAN().fit(X)
    condensed_tree = _condense_tree(
        est._single_linkage_tree_, min_cluster_size=est.min_cluster_size
    )

    # Node ids of the selected clusters and the label each one maps to.
    cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2}
    clusters = {n_samples + 2, n_samples + 3, n_samples + 4}
    labels = _do_labelling(
        condensed_tree=condensed_tree,
        clusters=clusters,
        cluster_label_map=cluster_label_map,
        allow_single_cluster=allow_single_cluster,
        cluster_selection_epsilon=epsilon,
    )

    # Align the ground truth with the predicted labelling: every target value
    # is mapped to the label predicted for its first occurrence.
    targets = list(set(y))
    y_to_labels = {}
    for target in targets:
        first_index = np.where(y == target)[0][0]
        y_to_labels[target] = labels[first_index]
    aligned_target = np.vectorize(y_to_labels.get)(y)
    assert_array_equal(labels, aligned_target)
def test_labelling_thresholding():
    """
    Check that `_do_labelling` thresholds the incoming lambda values
    correctly for different `cluster_selection_epsilon` values.
    """
    n_samples = 5
    MAX_LAMBDA = 1.5
    # Five points, all direct children of the single root cluster (node 5).
    condensed_tree = np.array(
        [
            (5, 2, MAX_LAMBDA, 1),
            (5, 1, 0.1, 1),
            (5, 0, MAX_LAMBDA, 1),
            (5, 3, 0.2, 1),
            (5, 4, 0.3, 1),
        ],
        dtype=CONDENSED_dtype,
    )

    # Each case pairs an epsilon with the lambda value below which a point
    # is expected to be labelled as noise. With epsilon=0 the threshold is
    # computed per-sample from the largest lambda of any sibling node; here
    # all points are siblings and the largest value is exactly MAX_LAMBDA.
    cases = ((1, 1), (0, MAX_LAMBDA))
    for epsilon, noise_cutoff in cases:
        labels = _do_labelling(
            condensed_tree=condensed_tree,
            clusters={n_samples},
            cluster_label_map={n_samples: 0, n_samples + 1: 1},
            allow_single_cluster=True,
            cluster_selection_epsilon=epsilon,
        )
        expected_noise = condensed_tree["value"] < noise_cutoff
        assert sum(expected_noise) == sum(labels == -1)
# TODO(1.6): Remove
def test_hdbscan_warning_on_deprecated_algorithm_name():
    """Check the FutureWarning emitted for the deprecated algorithm names."""
    kdtree_msg = (
        "`algorithm='kdtree'`has been deprecated in 1.4 and will be renamed"
        " to'kd_tree'`in 1.6. To keep the past behaviour, set `algorithm='kd_tree'`."
    )
    balltree_msg = (
        "`algorithm='balltree'`has been deprecated in 1.4 and will be renamed"
        " to'ball_tree'`in 1.6. To keep the past behaviour, set"
        " `algorithm='ball_tree'`."
    )

    # Each deprecated spelling must trigger its own warning message.
    for deprecated_name, msg in (("kdtree", kdtree_msg), ("balltree", balltree_msg)):
        with pytest.warns(FutureWarning, match=msg):
            HDBSCAN(algorithm=deprecated_name).fit(X)
@pytest.mark.parametrize("store_centers", ["centroid", "medoid"])
def test_hdbscan_error_precomputed_and_store_centers(store_centers):
    """Check that requesting centers with a precomputed matrix raises.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27893
    """
    rng = np.random.RandomState(0)
    points = rng.random((100, 2))
    pairwise = euclidean_distances(points)

    estimator = HDBSCAN(metric="precomputed", store_centers=store_centers)
    err_msg = "Cannot store centers when using a precomputed distance matrix."
    with pytest.raises(ValueError, match=err_msg):
        estimator.fit(pairwise)
@pytest.mark.parametrize("valid_algo", ["auto", "brute"])
def test_hdbscan_cosine_metric_valid_algorithm(valid_algo):
    """Check that the "cosine" metric works with the "brute" and "auto"
    algorithms.

    Non-regression test for issue #28631
    """
    estimator = HDBSCAN(metric="cosine", algorithm=valid_algo)
    estimator.fit_predict(X)
@pytest.mark.parametrize("invalid_algo", ["kd_tree", "ball_tree"])
def test_hdbscan_cosine_metric_invalid_algorithm(invalid_algo):
    """Check that an informative error is raised when an unsupported
    algorithm is combined with the "cosine" metric.
    """
    estimator = HDBSCAN(metric="cosine", algorithm=invalid_algo)
    expected = "cosine is not a valid metric"
    with pytest.raises(ValueError, match=expected):
        estimator.fit_predict(X)
@@ -0,0 +1,900 @@
"""
Several basic tests for hierarchical clustering procedures
"""
# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
# Matteo Visconti di Oleggio Castello 2014
# License: BSD 3 clause
import itertools
import shutil
from functools import partial
from tempfile import mkdtemp
import numpy as np
import pytest
from scipy.cluster import hierarchy
from scipy.sparse.csgraph import connected_components
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree
from sklearn.cluster._agglomerative import (
_TREE_BUILDERS,
_fix_connectivity,
_hc_cut,
linkage_tree,
)
from sklearn.cluster._hierarchical_fast import (
average_merge,
max_merge,
mst_linkage_core,
)
from sklearn.datasets import make_circles, make_moons
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.metrics import DistanceMetric
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import (
PAIRED_DISTANCES,
cosine_distances,
manhattan_distances,
pairwise_distances,
)
from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
from sklearn.neighbors import kneighbors_graph
from sklearn.utils._fast_dict import IntFloatDict
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
create_memmap_backed_data,
ignore_warnings,
)
from sklearn.utils.fixes import LIL_CONTAINERS
def test_linkage_misc():
    """Miscellaneous checks on `linkage_tree` and `FeatureAgglomeration`."""
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))

    # An unknown linkage and a connectivity of the wrong shape both raise.
    invalid_calls = ({"linkage": "foo"}, {"connectivity": np.ones((4, 4))})
    for bad_kwargs in invalid_calls:
        with pytest.raises(ValueError):
            linkage_tree(X, **bad_kwargs)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # A precomputed cosine distance matrix gives the same children as
    # letting linkage_tree compute the cosine affinity itself.
    precomputed = cosine_distances(X)
    from_precomputed = linkage_tree(precomputed, affinity="precomputed")
    assert_array_equal(from_precomputed[0], linkage_tree(X, affinity="cosine")[0])

    # A callable affinity gives the same children as its string name.
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(from_callable[0], linkage_tree(X, affinity="manhattan")[0])
def test_structured_linkage_tree():
    """Check the tree size and input validation of every structured builder."""
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries. Only the shape of the mask is
    # used below to build the grid connectivity.
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    expected_n_nodes = 2 * X.shape[1] - 1

    for tree_builder in _TREE_BUILDERS.values():
        children, _, n_leaves, _ = tree_builder(X.T, connectivity=connectivity)
        assert len(children) + n_leaves == expected_n_nodes

        # A connectivity matrix of the wrong shape must raise a ValueError.
        wrong_shape = np.ones((4, 4))
        with pytest.raises(ValueError):
            tree_builder(X.T, connectivity=wrong_shape)

        # Fitting with no samples must raise an error as well.
        with pytest.raises(ValueError):
            tree_builder(X.T[:0], connectivity=connectivity)
def test_unstructured_linkage_tree():
    """Check the tree size produced by every unstructured tree builder."""
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    expected_n_nodes = 2 * X.shape[1] - 1

    for this_X in (X, X[0]):
        # A number of clusters is specified just for the sake of raising a
        # warning and testing the warning code.
        with ignore_warnings(), pytest.warns(UserWarning):
            children, _, n_leaves, _ = ward_tree(this_X.T, n_clusters=10)
        assert len(children) + n_leaves == expected_n_nodes

    for tree_builder in _TREE_BUILDERS.values():
        for this_X in (X, X[0]):
            with ignore_warnings(), pytest.warns(UserWarning):
                children, _, n_leaves, _ = tree_builder(this_X.T, n_clusters=10)
            assert len(children) + n_leaves == expected_n_nodes
def test_height_linkage_tree():
    """Check the number of nodes built by every structured linkage function.

    NOTE(review): the test name refers to tree heights, but the assertions
    below only verify the node count.
    """
    rng = np.random.RandomState(0)
    grid_shape = np.ones([10, 10], dtype=bool).shape
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*grid_shape)
    expected_n_nodes = 2 * X.shape[1] - 1

    for linkage_func in _TREE_BUILDERS.values():
        children, _, n_leaves, _ = linkage_func(X.T, connectivity=connectivity)
        assert len(children) + n_leaves == expected_n_nodes
def test_zero_cosine_linkage_tree():
    """Check that a zero vector in X is rejected with the 'cosine' affinity."""
    # The second row is the zero vector.
    with_zero_row = np.array([[0, 1], [0, 0]])
    expected = "Cosine affinity cannot be used when X contains zero vectors"
    with pytest.raises(ValueError, match=expected):
        linkage_tree(with_zero_row, affinity="cosine")
@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
@pytest.mark.parametrize("compute_distances", [True, False])
@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"])
def test_agglomerative_clustering_distances(
    n_clusters, compute_distances, distance_threshold, linkage
):
    """Check that `distances_` exists exactly when `compute_distances` is
    True or a `distance_threshold` is given.
    """
    rng = np.random.RandomState(0)
    grid_shape = np.ones([10, 10], dtype=bool).shape
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*grid_shape)

    clustering = AgglomerativeClustering(
        n_clusters=n_clusters,
        connectivity=connectivity,
        linkage=linkage,
        distance_threshold=distance_threshold,
        compute_distances=compute_distances,
    )
    clustering.fit(X)

    if not compute_distances and distance_threshold is None:
        assert not hasattr(clustering, "distances_")
        return

    assert hasattr(clustering, "distances_")
    # One distance is stored per merge, i.e. per row of `children_`.
    n_children = clustering.children_.shape[0]
    assert clustering.distances_.shape == (n_children,)
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_agglomerative_clustering(global_random_seed, lil_container):
    """Check that agglomerative clustering yields the requested number of
    clusters across linkages, caching, early stopping, metrics and
    precomputed distances.
    """
    rng = np.random.RandomState(global_random_seed)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage
        )
        clustering.fit(X)

        # Test caching. The directory is created *before* entering `try`, so
        # a failing `mkdtemp` surfaces its own error instead of a NameError
        # raised from the `finally` clause.
        tempdir = mkdtemp()
        try:
            clustering = AgglomerativeClustering(
                n_clusters=10,
                connectivity=connectivity,
                memory=tempdir,
                linkage=linkage,
            )
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)

        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage
        )
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10

        # Check that a connectivity matrix whose shape (10, 10) does not
        # match the 100 samples raises a ValueError
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=lil_container(connectivity.toarray()[:10, :10]),
            linkage=linkage,
        )
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        metric="manhattan",
        linkage="ward",
    )
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test using another metric than euclidean works with linkage complete
    for metric in PAIRED_DISTANCES.keys():
        # Compare the structured implementation (with a fully connected
        # connectivity matrix) to the unstructured one
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            metric=metric,
            linkage="complete",
        )
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10, connectivity=None, metric=metric, linkage="complete"
        )
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1
        )

    # Test that using a distance matrix (metric = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(
        n_clusters=10, connectivity=connectivity, linkage="complete"
    )
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity,
        metric="precomputed",
        linkage="complete",
    )
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
def test_agglomerative_clustering_memory_mapped():
    """Smoke test: AgglomerativeClustering must fit a mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    rng = np.random.RandomState(0)
    in_memory = rng.randn(50, 100)
    Xmm = create_memmap_backed_data(in_memory)
    estimator = AgglomerativeClustering(metric="euclidean", linkage="single")
    estimator.fit(Xmm)
def test_ward_agglomeration(global_random_seed):
    """Check FeatureAgglomeration fit/transform/inverse_transform in a
    simplistic case.
    """
    rng = np.random.RandomState(global_random_seed)
    grid_shape = np.ones([10, 10], dtype=bool).shape
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*grid_shape)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    n_feature_clusters = np.size(np.unique(agglo.labels_))
    assert n_feature_clusters == 5

    # The reduced data has one column per feature cluster.
    X_red = agglo.transform(X)
    assert X_red.shape[1] == 5

    # Mapping back gives rows with only 5 distinct values, and reducing the
    # reconstruction again reproduces the reduced data.
    X_full = agglo.inverse_transform(X_red)
    assert np.unique(X_full[0]).size == 5
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    no_samples = X[:0]
    with pytest.raises(ValueError):
        agglo.fit(no_samples)
def test_single_linkage_clustering():
    """Check single linkage on two emblematic datasets: moons and circles."""
    datasets = (
        make_moons(noise=0.05, random_state=42),
        make_circles(factor=0.5, noise=0.025, random_state=42),
    )
    for points, true_labels in datasets:
        clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
        clustering.fit(points)
        score = normalized_mutual_info_score(clustering.labels_, true_labels)
        assert_almost_equal(score, 1)
def assess_same_labelling(cut1, cut2):
    """Assert that two labellings describe the same partition.

    Util for comparison with scipy. Label values themselves may differ;
    only the "same cluster" relation between samples is compared.
    """

    def co_membership(cut):
        # One-hot encode the labels, then multiply with the transpose: entry
        # (i, j) is 1 when samples i and j share a label and 0 otherwise.
        n_items = len(cut)
        n_labels = cut.max() + 1
        one_hot = np.zeros((n_items, n_labels))
        one_hot[np.arange(n_items), cut] = 1
        return np.dot(one_hot, one_hot.T)

    first, second = co_membership(cut1), co_membership(cut2)
    assert (first == second).all()
def test_sparse_scikit_vs_scipy(global_random_seed):
    """Compare scikit linkage with full connectivity (i.e. unstructured)
    against scipy.
    """
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(global_random_seed)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage, tree_builder in _TREE_BUILDERS.items():
        for _ in range(5):
            X = 0.1 * rng.normal(size=(n, p))
            X -= 4.0 * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            scipy_tree = hierarchy.linkage(X, method=linkage)
            children_ = scipy_tree[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = tree_builder(X, connectivity=connectivity)

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(
                children,
                children_,
                "linkage tree differs from scipy impl for linkage: " + linkage,
            )

            # Cutting both trees at k clusters must give the same partition.
            assess_same_labelling(
                _hc_cut(k, children, n_leaves), _hc_cut(k, children_, n_leaves)
            )

    # Test error management in _hc_cut: more clusters than leaves is invalid.
    too_many_clusters = n_leaves + 1
    with pytest.raises(ValueError):
        _hc_cut(too_many_clusters, children, n_leaves)
# Make sure our custom mst_linkage_core gives
# the same results as scipy's builtin
def test_vector_scikit_single_vs_scipy_single(global_random_seed):
    """Check that the unstructured single-linkage builder agrees with scipy's
    single linkage.
    """
    n_samples, n_features, n_clusters = 10, 5, 3
    rng = np.random.RandomState(global_random_seed)
    X = 0.1 * rng.normal(size=(n_samples, n_features))
    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
    X -= X.mean(axis=1)[:, np.newaxis]

    scipy_tree = hierarchy.linkage(X, method="single")
    children_scipy = scipy_tree[:, :2].astype(int)

    single_builder = _TREE_BUILDERS["single"]
    children, _, n_leaves, _ = single_builder(X)

    # Sort the order of child nodes per row for consistency
    children.sort(axis=1)
    assert_array_equal(
        children,
        children_scipy,
        "linkage tree differs from scipy impl for single linkage.",
    )

    # Both trees must also induce the same partition when cut.
    assess_same_labelling(
        _hc_cut(n_clusters, children, n_leaves),
        _hc_cut(n_clusters, children_scipy, n_leaves),
    )
@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
def test_mst_linkage_core_memory_mapped(metric_param_grid):
    """Check that `mst_linkage_core` gives equal results on in-memory and
    mem-mapped data.

    Non-regression test for issue #19875.
    """
    rng = np.random.RandomState(seed=1)
    X = rng.normal(size=(20, 4))
    Xmm = create_memmap_backed_data(X)

    metric, param_grid = metric_param_grid
    param_names = param_grid.keys()
    # Try every combination of the metric's parameter values.
    for combination in itertools.product(*param_grid.values()):
        metric_kwargs = dict(zip(param_names, combination))
        distance_metric = DistanceMetric.get_metric(metric, **metric_kwargs)
        in_memory_mst = mst_linkage_core(X, distance_metric)
        memmapped_mst = mst_linkage_core(Xmm, distance_metric)
        np.testing.assert_equal(in_memory_mst, memmapped_mst)
def test_identical_points():
    """Check that identical points are handled correctly when clustering
    with a sparse connectivity matrix.

    The data consists of three pairs of duplicated points, so each linkage
    is expected to recover exactly those three pairs.
    """
    X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    # Symmetrize the neighbors graph.
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean")

    # Every supported linkage is exercised once ("average" used to be listed
    # twice while "complete" was never covered).
    for linkage in ("single", "complete", "average", "ward"):
        clustering = AgglomerativeClustering(
            n_clusters=3, linkage=linkage, connectivity=connectivity
        )
        clustering.fit(X)

        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, true_labels), 1
        )
def test_connectivity_propagation():
    """Check that connectivity in the ward tree is propagated correctly
    during merging.
    """
    # 15 points, several of them exact duplicates.
    rows = [(0.014, 0.120), (0.014, 0.099), (0.014, 0.097)]
    rows += [(0.017, 0.153)] * 2
    rows += [(0.018, 0.153)] * 7
    rows += [(0.018, 0.152), (0.018, 0.149), (0.018, 0.144)]
    X = np.array(rows)

    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage="ward"
    )
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
def test_ward_tree_children_order(global_random_seed):
    """Check that structured and unstructured `ward_tree` order the children
    in the same way, on five random datasets.
    """
    n, p = 10, 5
    rng = np.random.RandomState(global_random_seed)
    full_connectivity = np.ones((n, n))

    for _ in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        children_unstructured = ward_tree(X)[0]
        children_structured = ward_tree(X, connectivity=full_connectivity)[0]
        assert_array_equal(children_unstructured, children_structured)
def test_ward_linkage_tree_return_distance(global_random_seed):
    """Check the `return_distance` option of `ward_tree` and `linkage_tree`.

    Structured (full connectivity) and unstructured trees must agree on
    random data, and both must match reference linkages on a small dataset
    whose truth is known.
    """
    n, p = 10, 5
    rng = np.random.RandomState(global_random_seed)
    connectivity = np.ones((n, n))

    for _ in range(5):
        X = 0.1 * rng.normal(size=(n, p))
        X -= 4.0 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity, return_distance=True)

        # Same clusters (children are the first returned item) ...
        assert_array_equal(out_unstructured[0], out_structured[0])
        # ... and same distances (the last returned item).
        assert_array_almost_equal(out_unstructured[-1], out_structured[-1])

        for linkage in ["average", "complete", "single"]:
            structured_distances = linkage_tree(
                X, connectivity=connectivity, linkage=linkage, return_distance=True
            )[-1]
            unstructured_distances = linkage_tree(
                X, linkage=linkage, return_distance=True
            )[-1]

            # NOTE(review): only the last and the first entries of the
            # distance arrays are compared here; the children themselves are
            # not compared for these linkages.
            assert_array_almost_equal(
                structured_distances[-1], unstructured_distances[-1]
            )
            assert_array_almost_equal(
                structured_distances[0], unstructured_distances[0]
            )

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array(
        [
            [1.43054825, -7.5693489],
            [6.95887839, 6.82293382],
            [2.87137846, -9.68248579],
            [7.87974764, -6.05485803],
            [8.24018364, -6.09495602],
            [7.39020262, 8.54004355],
        ]
    )
    # truth: one row per merge, the columns used below are the two children
    # (0 and 1) and the merge distance (2).
    linkage_X_ward = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 9.10208346, 4.0],
            [7.0, 9.0, 24.7784379, 6.0],
        ]
    )
    linkage_X_complete = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.96742194, 4.0],
            [7.0, 9.0, 18.77445997, 6.0],
        ]
    )
    linkage_X_average = np.array(
        [
            [3.0, 4.0, 0.36265956, 2.0],
            [1.0, 5.0, 1.77045373, 2.0],
            [0.0, 2.0, 2.55760419, 2.0],
            [6.0, 8.0, 6.55832839, 4.0],
            [7.0, 9.0, 15.44089605, 6.0],
        ]
    )

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    ward_outputs = (
        ward_tree(X, return_distance=True),
        ward_tree(X, connectivity=connectivity_X, return_distance=True),
    )
    for ward_output in ward_outputs:
        # check that the labels are the same
        assert_array_equal(linkage_X_ward[:, :2], ward_output[0])
        # check that the distances are correct
        assert_array_almost_equal(linkage_X_ward[:, 2], ward_output[4])

    linkage_options = ["complete", "average", "single"]
    X_linkage_truth = [linkage_X_complete, linkage_X_average]
    # `zip` stops at the shorter sequence, so "single" (which has no
    # reference linkage here) is not exercised by this loop.
    for linkage, X_truth in zip(linkage_options, X_linkage_truth):
        linkage_outputs = (
            linkage_tree(X, return_distance=True, linkage=linkage),
            linkage_tree(
                X, connectivity=connectivity_X, linkage=linkage, return_distance=True
            ),
        )
        for linkage_output in linkage_outputs:
            # check that the labels are the same
            assert_array_equal(X_truth[:, :2], linkage_output[0])
            # check that the distances are correct
            assert_array_almost_equal(X_truth[:, 2], linkage_output[4])
def test_connectivity_fixing_non_lil():
    """Non-regression check for a connectivity that is not item assignable
    and has more than one component.
    """
    # Dummy data: two samples.
    samples = np.array([[0, 0], [1, 1]])
    # A mask with several components forces connectivity fixing.
    diagonal_mask = np.array([[True, False], [False, True]])
    graph = grid_to_graph(n_x=2, n_y=2, mask=diagonal_mask)

    ward = AgglomerativeClustering(connectivity=graph, linkage="ward")
    with pytest.warns(UserWarning):
        ward.fit(samples)
def test_int_float_dict():
    """Check IntFloatDict lookups and smoke test the merge helpers."""
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    # Every stored key must map back to the value it was created with.
    for index, key in enumerate(keys):
        assert d[key] == values[index]

    other_keys = np.arange(50, dtype=np.intp)[::2]
    other_values = np.full(50, 0.5)[::2]
    other = IntFloatDict(other_keys, other_values)

    # Complete smoke test
    for merge in (max_merge, average_merge):
        merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
def test_connectivity_callable():
    """Check that a callable connectivity gives the same labels as the
    matrix it would compute.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    as_matrix = kneighbors_graph(X, 3, include_self=False)
    as_callable = partial(kneighbors_graph, n_neighbors=3, include_self=False)

    aglc1 = AgglomerativeClustering(connectivity=as_matrix)
    aglc2 = AgglomerativeClustering(connectivity=as_callable)
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_ignores_diagonal():
    """Check that self-connections in the connectivity graph do not change
    the resulting labels.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    without_self = kneighbors_graph(X, 3, include_self=False)
    with_self = kneighbors_graph(X, 3, include_self=True)

    aglc1 = AgglomerativeClustering(connectivity=without_self)
    aglc2 = AgglomerativeClustering(connectivity=with_self)
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_compute_full_tree():
    """Check when the full tree is built depending on `n_clusters`."""
    rng = np.random.RandomState(0)

    # When n_clusters is small, the full tree should be built, that is the
    # number of merges should be n_samples - 1.
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)
    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_merges = agc.children_.shape[0]
    assert n_merges == n_samples - 1

    # When n_clusters is large, greater than max of 100 and 0.02 * n_samples,
    # we should stop when there are n_clusters.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_merges = agc.children_.shape[0]
    assert n_merges == n_samples - n_clusters
def test_n_components():
    """Check the number of components reported by every tree builder."""
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Connectivity matrix having five components.
    connectivity = np.eye(5)

    for linkage_func in _TREE_BUILDERS.values():
        quiet_builder = ignore_warnings(linkage_func)
        output = quiet_builder(X, connectivity=connectivity)
        # The second returned item is the number of connected components.
        assert output[1] == 5
def test_affinity_passed_to_fix_connectivity():
    """Check that a callable affinity is actually invoked by `linkage_tree`
    when the connectivity has to be fixed.
    """
    size = 2
    rng = np.random.RandomState(0)
    X = rng.randn(size, size)
    mask = np.array([True, False, False, True])
    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)

    class FakeAffinity:
        """Affinity stand-in that counts how many times it is called."""

        def __init__(self):
            self.counter = 0

        def increment(self, *args, **kwargs):
            self.counter += 1
            return self.counter

    fake = FakeAffinity()
    linkage_tree(X, connectivity=connectivity, affinity=fake.increment)

    # The affinity callable is expected to have been called three times.
    assert fake.counter == 3
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
def test_agglomerative_clustering_with_distance_threshold(linkage, global_random_seed):
    """Check that clustering with `distance_threshold` matches a cut of the
    linkage tree at the point where the distance exceeds the threshold.
    """
    rng = np.random.RandomState(global_random_seed)
    grid_shape = np.ones([10, 10], dtype=bool).shape
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*grid_shape)
    tree_builder = _TREE_BUILDERS[linkage]

    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn,
            linkage=linkage,
        )
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))

        # Build the same tree directly to inspect the merge distances.
        children, _, n_leaves, _, distances = tree_builder(
            X, connectivity=conn, n_clusters=None, return_distance=True
        )

        # Every merge at or above the threshold is not performed, which
        # leaves one more cluster than the number of such merges.
        n_rejected_merges = np.count_nonzero(distances >= distance_threshold)
        assert n_rejected_merges + 1 == num_clusters_produced

        # The labels must match an explicit cut of the tree.
        clusters_at_threshold = _hc_cut(
            n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves
        )
        assert np.array_equiv(clusters_produced, clusters_at_threshold)
def test_small_distance_threshold(global_random_seed):
    """Check that a small `distance_threshold` leaves every sample in its
    own cluster.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 10
    X = rng.randint(-300, 300, size=(n_samples, 3))

    # With a threshold of 1.0 no merge is expected as long as the samples are
    # all distinct (which may not be the case with a different random seed).
    estimator = AgglomerativeClustering(
        n_clusters=None, distance_threshold=1.0, linkage="single"
    )
    clustering = estimator.fit(X)

    # Check that the pairwise distances are indeed all larger than .1; the
    # diagonal is set to inf so self-distances are ignored.
    all_distances = pairwise_distances(X, metric="minkowski", p=2)
    np.fill_diagonal(all_distances, np.inf)
    assert np.all(all_distances > 0.1)

    assert clustering.n_clusters_ == n_samples
def test_cluster_distances_with_distance_threshold(global_random_seed):
    """Check distances within clusters and between clusters for single
    linkage with a `distance_threshold`.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))

    distance_threshold = 4
    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
    ).fit(X)
    labels = clustering.labels_

    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)

    for label in np.unique(labels):
        in_cluster_mask = labels == label
        rows_in_cluster = D[in_cluster_mask]
        within = rows_in_cluster[:, in_cluster_mask]
        across = rows_in_cluster[:, ~in_cluster_mask]

        # Largest nearest-neighbour distance inside the cluster, and the
        # smallest distance to any sample outside of it.
        max_in_cluster_distance = within.min(axis=0).max()
        min_out_cluster_distance = across.min(axis=0).min()

        # single data point clusters only have that inf diagonal here
        if in_cluster_mask.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold
@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
@pytest.mark.parametrize(
    ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]
)
def test_agglomerative_clustering_with_distance_threshold_edge_case(
    linkage, threshold, y_true
):
    """Check the boundary case where `distance_threshold` equals the
    distance between the two samples.
    """
    two_points = [[0], [1]]
    clusterer = AgglomerativeClustering(
        n_clusters=None, distance_threshold=threshold, linkage=linkage
    )
    y_pred = clusterer.fit_predict(two_points)
    score = adjusted_rand_score(y_true, y_pred)
    assert score == 1
def test_dist_threshold_invalid_parameters():
    """Check the errors raised for inconsistent ``distance_threshold`` settings."""
    X = [[0], [1]]

    # Neither or both of n_clusters / distance_threshold given.
    for n_clusters, distance_threshold in [(None, None), (2, 1)]:
        with pytest.raises(ValueError, match="Exactly one of "):
            AgglomerativeClustering(
                n_clusters=n_clusters, distance_threshold=distance_threshold
            ).fit(X)

    # A distance threshold combined with compute_full_tree=False.
    partial_tree = AgglomerativeClustering(
        n_clusters=None, distance_threshold=1, compute_full_tree=False
    )
    with pytest.raises(ValueError, match="compute_full_tree must be True if"):
        partial_tree.fit(X)
def test_invalid_shape_precomputed_dist_matrix():
    """Check that a non-square matrix is rejected when ``metric='precomputed'``.

    See PR #16257.
    """
    non_square = np.random.RandomState(0).rand(5, 3)
    expected_error = r"Distance matrix should be square, got matrix of shape \(5, 3\)"
    model = AgglomerativeClustering(metric="precomputed", linkage="complete")
    with pytest.raises(ValueError, match=expected_error):
        model.fit(non_square)
def test_precomputed_connectivity_metric_with_2_connected_components():
    """Fitting on precomputed distances must match fitting on the raw data.

    Covers the case where a connectivity matrix is supplied and it has more
    than one connected component. Non-regression test for #16151.
    """
    connectivity_matrix = np.array(
        [
            [0, 1, 1, 0, 0],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0],
            [0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0],
        ]
    )
    # Sanity check: the graph above really has two connected components.
    n_components = connected_components(connectivity_matrix)[0]
    assert n_components == 2

    X = np.random.RandomState(0).randn(5, 10)
    X_dist = pairwise_distances(X)
    msg = "Completing it to avoid stopping the tree early"

    # Same clustering expressed twice: once on distances, once on features.
    on_distances = AgglomerativeClustering(
        metric="precomputed", connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=msg):
        on_distances.fit(X_dist)

    on_features = AgglomerativeClustering(
        connectivity=connectivity_matrix, linkage="complete"
    )
    with pytest.warns(UserWarning, match=msg):
        on_features.fit(X)

    assert_array_equal(on_features.labels_, on_distances.labels_)
    assert_array_equal(on_features.children_, on_distances.children_)
# TODO(1.6): remove in 1.6
@pytest.mark.parametrize(
    "Agglomeration", [AgglomerativeClustering, FeatureAgglomeration]
)
def test_deprecation_warning_metric_None(Agglomeration):
    """Check that fitting with ``metric=None`` emits a ``FutureWarning``."""
    X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
    estimator = Agglomeration(metric=None)
    warn_msg = "`metric=None` is deprecated in version 1.4 and will be removed"
    with pytest.warns(FutureWarning, match=warn_msg):
        estimator.fit(X)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,215 @@
"""
Testing for mean shift clustering methods
"""
import warnings
import numpy as np
import pytest
from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
from sklearn.datasets import make_blobs
from sklearn.metrics import v_measure_score
from sklearn.utils._testing import assert_allclose, assert_array_equal
# Module-level data shared by tests in this file that do not build their own
# input (e.g. test_estimate_bandwidth, test_mean_shift): 300 two-dimensional
# samples generated by make_blobs around three centers, each shifted by +10,
# with cluster_std=0.4 and a fixed random_state.
n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(
n_samples=300,
n_features=2,
centers=centers,
cluster_std=0.4,
shuffle=True,
random_state=11,
)
def test_convergence_of_1d_constant_data():
    """Check that MeanShift stops before ``max_iter`` on constant 1D data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28926
    """
    constant_column = np.ones(10).reshape(-1, 1)
    model = MeanShift()
    model.fit(constant_column)
    assert model.n_iter_ < model.max_iter
def test_estimate_bandwidth():
    """Check that the bandwidth estimated on the shared data is in [0.9, 1.5]."""
    estimated = estimate_bandwidth(X, n_samples=200)
    assert 0.9 <= estimated <= 1.5
def test_estimate_bandwidth_1sample(global_dtype):
    """Check ``estimate_bandwidth`` when ``n_samples=1`` and ``quantile < 1``.

    In that configuration n_neighbors is set to 1, and the estimated
    bandwidth is expected to be (approximately) zero.
    """
    X_cast = X.astype(global_dtype, copy=False)
    bandwidth = estimate_bandwidth(X_cast, n_samples=1, quantile=0.3)

    # NOTE(review): the dtype is compared with the module-level ``X``, not
    # with the cast input -- confirm this is intended.
    assert bandwidth.dtype == X.dtype
    assert bandwidth == pytest.approx(0.0, abs=1e-5)
@pytest.mark.parametrize(
    "bandwidth, cluster_all, expected, first_cluster_label",
    [(1.2, True, 3, 0), (1.2, False, 4, -1)],
)
def test_mean_shift(
    global_dtype, bandwidth, cluster_all, expected, first_cluster_label
):
    """Check the ``MeanShift`` estimator and the ``mean_shift`` function.

    Both must produce the expected number of distinct labels, the expected
    smallest label, and cluster centers of the input dtype. The function form
    is called without an explicit bandwidth.
    """
    X_with_global_dtype = X.astype(global_dtype, copy=False)

    def check_result(labels, centers):
        # Assertions shared by the estimator and the function API.
        distinct = np.unique(labels)
        assert len(distinct) == expected
        assert distinct[0] == first_cluster_label
        assert centers.dtype == global_dtype

    estimator = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
    estimator.fit(X_with_global_dtype)
    check_result(estimator.labels_, estimator.cluster_centers_)

    cluster_centers, labels_mean_shift = mean_shift(
        X_with_global_dtype, cluster_all=cluster_all
    )
    check_result(labels_mean_shift, cluster_centers)
def test_parallel(global_dtype):
    """Check that ``n_jobs=2`` gives the same result as the default ``n_jobs``."""
    blob_centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    data, _ = make_blobs(
        n_samples=50,
        n_features=2,
        centers=blob_centers,
        cluster_std=0.4,
        shuffle=True,
        random_state=11,
    )
    data = data.astype(global_dtype, copy=False)

    two_jobs = MeanShift(n_jobs=2)
    two_jobs.fit(data)
    default_jobs = MeanShift()
    default_jobs.fit(data)

    assert_allclose(two_jobs.cluster_centers_, default_jobs.cluster_centers_)
    assert two_jobs.cluster_centers_.dtype == default_jobs.cluster_centers_.dtype
    assert_array_equal(two_jobs.labels_, default_jobs.labels_)
def test_meanshift_predict(global_dtype):
    """Check that ``predict`` on the training data agrees with ``fit_predict``."""
    X_with_global_dtype = X.astype(global_dtype, copy=False)
    model = MeanShift(bandwidth=1.2)
    fitted_labels = model.fit_predict(X_with_global_dtype)
    predicted_labels = model.predict(X_with_global_dtype)
    assert_array_equal(fitted_labels, predicted_labels)
def test_meanshift_all_orphans():
    """Check the error raised when no sample lies within bandwidth of a seed.

    The seeds are placed away from the data, so fitting must fail with a
    ``ValueError`` carrying an informative message.
    """
    far_seeds = [[-9, -9], [-10, -10]]
    model = MeanShift(bandwidth=0.1, seeds=far_seeds)
    with pytest.raises(ValueError, match="No point was within bandwidth=0.1"):
        model.fit(X)
def test_unfitted():
    """Non-regression: an unfitted ``MeanShift`` exposes no fitted attributes."""
    unfitted = MeanShift()
    for fitted_attribute in ("cluster_centers_", "labels_"):
        assert not hasattr(unfitted, fitted_attribute)
def test_cluster_intensity_tie(global_dtype):
    """Check the labels assigned to two equally sized groups in both orders.

    The same six points are fitted twice with the two groups of three swapped,
    and the exact label arrays are pinned for each ordering.
    """
    low_group = [[1, 1], [2, 1], [1, 0]]
    high_group = [[4, 7], [3, 5], [3, 6]]

    low_first = np.array(low_group + high_group, dtype=global_dtype)
    c1 = MeanShift(bandwidth=2).fit(low_first)

    high_first = np.array(high_group + low_group, dtype=global_dtype)
    c2 = MeanShift(bandwidth=2).fit(high_first)

    assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0])
    assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1])
def test_bin_seeds(global_dtype):
    """Check ``get_bin_seeds``, the bin seeding usable by mean shift."""
    # Six points in the plane.
    X = np.array(
        [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]],
        dtype=global_dtype,
    )

    def seed_set(bin_size, min_bin_freq):
        # Seeds as a set of tuples so they can be compared regardless of order.
        return {tuple(seed) for seed in get_bin_seeds(X, bin_size, min_bin_freq)}

    # Bin coarseness 1.0 and min_bin_freq 1: three bins should be found.
    assert seed_set(1, 1) == {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)}

    # Bin coarseness 1.0 and min_bin_freq 2: two bins should be found.
    assert seed_set(1, 2) == {(1.0, 1.0), (2.0, 1.0)}

    # Bin size 0.01 and min_bin_freq 1: six bins would be found, so the
    # function bails out and the whole data is used instead.
    with warnings.catch_warnings(record=True):
        fine_bins = get_bin_seeds(X, 0.01, 1)
    assert_allclose(fine_bins, X)

    # Tight clusters around [0, 0] and [1, 1]: only two bins.
    X, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=[[0, 0], [1, 1]],
        cluster_std=0.1,
        random_state=0,
    )
    X = X.astype(global_dtype, copy=False)
    blob_bins = get_bin_seeds(X, 1)
    assert_array_equal(blob_bins, [[0, 0], [1, 1]])
@pytest.mark.parametrize("max_iter", [1, 100])
def test_max_iter(max_iter):
    """Check that ``max_iter`` is honoured and both APIs agree on the centers."""
    function_centers, _ = mean_shift(X, max_iter=max_iter)
    estimator = MeanShift(max_iter=max_iter).fit(X)
    estimator_centers = estimator.cluster_centers_

    assert estimator.n_iter_ <= estimator.max_iter
    assert len(function_centers) == len(estimator_centers)

    for from_function, from_estimator in zip(function_centers, estimator_centers):
        assert np.allclose(from_function, from_estimator)
def test_mean_shift_zero_bandwidth(global_dtype):
    """Check that mean shift works when the estimated bandwidth is 0."""
    values = [1, 1, 1, 2, 2, 2, 3, 3]
    X = np.array(values, dtype=global_dtype).reshape(-1, 1)

    # With default arguments the estimated bandwidth is 0 on this dataset.
    bandwidth = estimate_bandwidth(X)
    assert bandwidth == 0

    # A bin_size of 0 must hand back the very same dataset object.
    assert get_bin_seeds(X, bin_size=bandwidth) is X

    # Binning with a 0 estimated bandwidth should be equivalent to no binning.
    with_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
    without_binning = MeanShift(bin_seeding=False).fit(X)

    expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])
    for fitted in (with_binning, without_binning):
        assert v_measure_score(fitted.labels_, expected_labels) == pytest.approx(1)

    assert_allclose(with_binning.cluster_centers_, without_binning.cluster_centers_)
@@ -0,0 +1,858 @@
# Authors: Shane Grigsby <refuge@rocktalus.com>
# Adrin Jalali <adrin.jalali@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
import pytest
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.exceptions import DataConversionWarning, EfficiencyWarning
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_allclose, assert_array_equal
from sklearn.utils.fixes import CSR_CONTAINERS
# Module-level data shared by tests in this file that do not build their own
# input (e.g. test_min_cluster_size, test_compare_to_ELKI): six 2D groups of
# n_points_per_cluster samples each, drawn from a seeded generator around
# different centers with different spreads, stacked into a single array X.
rng = np.random.RandomState(0)
n_points_per_cluster = 10
C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2)
C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[10, 8.9, 8.8, 8.7, 7, 10], 3],
        [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
        # This case was listed twice; the exact duplicate has been dropped.
        [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
    ],
)
def test_extend_downward(r_plot, end):
    """Check the index returned by ``_extend_region`` on steep-downward masks.

    ``r_plot`` is a reachability-like sequence; the masks passed to
    ``_extend_region`` are derived from the ratio of consecutive values.
    """
    r_plot = np.array(r_plot)
    # ratio[i] = r_plot[i] / r_plot[i + 1]
    ratio = r_plot[:-1] / r_plot[1:]
    steep_downward = ratio >= 1 / 0.9
    upward = ratio < 1

    e = _extend_region(steep_downward, upward, 0, 2)
    assert e == end
@pytest.mark.parametrize(
    ("r_plot", "end"),
    [
        [[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6],
        [[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0],
        [[1, 2, 2.1, 2, np.inf], 0],
        [[1, 2, 2.1, np.inf], 2],
    ],
)
def test_extend_upward(r_plot, end):
    """Check the index returned by ``_extend_region`` on steep-upward masks."""
    values = np.array(r_plot)
    # consecutive_ratio[i] = values[i] / values[i + 1]
    consecutive_ratio = values[:-1] / values[1:]
    steep_upward = consecutive_ratio <= 0.9
    downward = consecutive_ratio > 1

    region_end = _extend_region(steep_upward, downward, 0, 2)
    assert region_end == end
@pytest.mark.parametrize(
    ("ordering", "clusters", "expected"),
    [
        [[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]],
        [[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]],
        [[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]],
        [[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]],
    ],
)
def test_the_extract_xi_labels(ordering, clusters, expected):
    """Check the labels ``_extract_xi_labels`` derives from cluster intervals."""
    computed = _extract_xi_labels(ordering, clusters)
    assert_array_equal(computed, expected)
def test_extract_xi(global_dtype):
    """Check xi cluster extraction on small, well separated datasets.

    No cluster surrounds another, but clear noise points are present. The
    random generator is consumed in a fixed order (data first, then each
    shuffle), which the expected labels depend on.
    """
    rng = np.random.RandomState(0)
    n_points_per_cluster = 5

    C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2)
    C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2)
    C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
    C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2)
    C5 = [3, -2] + 0.6 * rng.randn(n_points_per_cluster, 2)
    C6 = [5, 6] + 0.2 * rng.randn(n_points_per_cluster, 2)

    # --- One outlier, integer min_samples / min_cluster_size -------------
    one_outlier = np.array([[100, 100]])
    X = np.vstack((C1, C2, C3, C4, C5, one_outlier, C6)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    model = OPTICS(
        min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4
    )
    assert_array_equal(model.fit(X).labels_, expected_labels)

    # --- Same data, float min_samples / min_cluster_size -----------------
    model = OPTICS(
        min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4
    )
    assert_array_equal(model.fit(X).labels_, expected_labels)

    # --- Two outliers: may fail if the predecessor correction is not at
    # work! ----------------------------------------------------------------
    two_outliers = np.array([[100, 100]] * 2)
    X = np.vstack((C1, C2, C3, C4, C5, two_outliers, C6)).astype(
        global_dtype, copy=False
    )
    expected_labels = np.r_[
        [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5
    ]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    model = OPTICS(
        min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3
    )
    assert_array_equal(model.fit(X).labels_, expected_labels)

    # --- Three hand-written groups at very different scales --------------
    C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
    C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
    C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)
    expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    model = OPTICS(
        min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04
    )
    assert_array_equal(model.fit(X).labels_, expected_labels)
def test_cluster_hierarchy_(global_dtype):
    """Check ``cluster_hierarchy_`` on a tight blob inside a wide one.

    Two clusters are expected, with boundaries close to [0, 99] and [0, 199].
    """
    rng = np.random.RandomState(0)
    n_points_per_cluster = 100

    tight_blob = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2).astype(
        global_dtype, copy=False
    )
    wide_blob = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2).astype(
        global_dtype, copy=False
    )
    X = shuffle(np.vstack((tight_blob, wide_blob)), random_state=0)

    hierarchy = OPTICS(min_samples=20, xi=0.1).fit(X).cluster_hierarchy_
    assert hierarchy.shape == (2, 2)

    reference = np.array([[0, 99], [0, 199]])
    # NOTE(review): the differences are summed with their sign, so deviations
    # of opposite sign can cancel out -- confirm whether an absolute
    # difference was intended.
    total_offset = np.sum(hierarchy - reference)
    assert total_offset / len(X) < 0.05
@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski")] + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_correct_number_of_clusters(metric, csr_container):
    """Check the cluster count and the types/sizes of the fitted attributes."""
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    fit_input = X if csr_container is None else csr_container(X)

    # Parameters chosen specifically for this task.
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric)
    clust.fit(fit_input)

    # Number of clusters, ignoring the noise label if present.
    has_noise = -1 in clust.labels_
    found_clusters = len(set(clust.labels_)) - int(has_noise)
    assert found_clusters == n_clusters

    # Each fitted attribute has one entry per sample and the expected dtype
    # kind ("i" for integers, "f" for floats).
    expected_kinds = [
        ("labels_", "i"),
        ("reachability_", "f"),
        ("core_distances_", "f"),
        ("ordering_", "i"),
    ]
    for attribute, kind in expected_kinds:
        values = getattr(clust, attribute)
        assert values.shape == (len(X),)
        assert values.dtype.kind == kind

    # The ordering must visit every sample index.
    assert set(clust.ordering_) == set(range(len(X)))
def test_minimum_number_of_sample_check():
    """Check that ``min_samples`` larger than the sample count is rejected."""
    single_sample = [[1, 1]]
    model = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0)
    with pytest.raises(ValueError, match="min_samples must be no greater than"):
        model.fit(single_sample)
def test_bad_extract():
    """Check the error raised when the extraction ``eps`` exceeds ``max_eps``."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    model = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10)
    expected_error = "Specify an epsilon smaller than 0.15. Got 0.3."
    with pytest.raises(ValueError, match=expected_error):
        model.fit(X)
def test_bad_reachability():
    """Check the warning emitted when every reachability value is infinite."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    expected_warning = "All reachability values are inf. Set a larger max_eps."
    with pytest.warns(UserWarning, match=expected_warning):
        OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015).fit(X)
def test_nowarn_if_metric_bool_data_bool():
    """Check that no conversion warning is raised for boolean metric and data.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18996
    """
    pairwise_metric = "rogerstanimoto"
    # Seeded generator instead of the global ``np.random`` state, so that the
    # test sees the same data on every run.
    rng = np.random.RandomState(0)
    X = rng.randint(2, size=(5, 2), dtype=bool)

    with warnings.catch_warnings():
        # Turn any DataConversionWarning into a test failure.
        warnings.simplefilter("error", DataConversionWarning)
        OPTICS(metric=pairwise_metric).fit(X)
def test_warn_if_metric_bool_data_no_bool():
    """Check that exactly one conversion warning is raised for non-bool data.

    The metric is boolean but the data is not, so a *single*
    ``DataConversionWarning`` is expected.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18996
    """
    pairwise_metric = "rogerstanimoto"
    # Seeded generator instead of the global ``np.random`` state, so that the
    # test sees the same data on every run.
    rng = np.random.RandomState(0)
    X = rng.randint(2, size=(5, 2), dtype=np.int32)
    msg = f"Data will be converted to boolean for metric {pairwise_metric}"

    with pytest.warns(DataConversionWarning, match=msg) as warn_record:
        OPTICS(metric=pairwise_metric).fit(X)

    assert len(warn_record) == 1
def test_nowarn_if_metric_no_bool():
    """Check that a non-boolean metric never triggers a conversion warning.

    Both boolean and integer data are fitted; neither may raise a
    ``DataConversionWarning``.
    """
    pairwise_metric = "minkowski"
    # Seeded generator instead of the global ``np.random`` state, so that the
    # test sees the same data on every run.
    rng = np.random.RandomState(0)
    X_bool = rng.randint(2, size=(5, 2), dtype=bool)
    X_num = rng.randint(2, size=(5, 2), dtype=np.int32)

    with warnings.catch_warnings():
        # Turn any DataConversionWarning into a test failure.
        warnings.simplefilter("error", DataConversionWarning)
        # fit boolean data
        OPTICS(metric=pairwise_metric).fit(X_bool)
        # fit numeric data
        OPTICS(metric=pairwise_metric).fit(X_num)
def test_close_extract():
    """Check extraction when the extraction eps is close to the scaled max_eps."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=750, centers=blob_centers, cluster_std=0.4, random_state=0
    )

    model = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10)
    model.fit(X)

    # Labels start at 0, so a maximum label of 2 means three clusters.
    assert max(model.labels_) == 2
@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5])
@pytest.mark.parametrize("min_samples", [3, 10, 20])
@pytest.mark.parametrize(
    "csr_container, metric",
    [(None, "minkowski"), (None, "euclidean")]
    + [(container, "euclidean") for container in CSR_CONTAINERS],
)
def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container):
    """Check that OPTICS (dbscan extraction) labels differ from DBSCAN by <= 5%."""
    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(
        n_samples=150, centers=blob_centers, cluster_std=0.4, random_state=0
    )
    if csr_container is not None:
        X = csr_container(X)
    X = X.astype(global_dtype, copy=False)

    # OPTICS with dbscan extraction at the parametrized eps.
    optics = OPTICS(
        min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric
    )
    optics.fit(X)

    # Reference DBSCAN labels for the same eps / min_samples.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(X)

    # Agreement is the smaller of the two best-match totals taken along each
    # axis of the contingency matrix.
    contingency = contingency_matrix(dbscan.labels_, optics.labels_)
    best_per_column = np.sum(np.max(contingency, axis=0))
    best_per_row = np.sum(np.max(contingency, axis=1))
    agree = min(best_per_column, best_per_row)

    n_samples = X.shape[0]
    disagree = n_samples - agree
    percent_mismatch = np.round((disagree - 1) / n_samples, 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
def test_min_samples_edge_case(global_dtype):
    """Check xi labels on three 3-point groups for varying min_samples/max_eps."""
    C1 = [[0, 0], [0, 0.1], [0, -0.1]]
    C2 = [[10, 10], [10, 9], [10, 11]]
    C3 = [[100, 100], [100, 96], [100, 106]]
    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)

    # max_eps=7: all three groups are recovered.
    all_groups = np.r_[[0] * 3, [1] * 3, [2] * 3]
    model = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(model.labels_, all_groups)

    # max_eps=3: the third group is labelled as noise.
    last_group_noise = np.r_[[0] * 3, [1] * 3, [-1] * 3]
    model = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
    assert_array_equal(model.labels_, last_group_noise)

    # min_samples=4 with max_eps=3: everything is noise and a warning about
    # the reachability values is emitted.
    all_noise = np.r_[[-1] * 9]
    with pytest.warns(UserWarning, match="All reachability values"):
        model = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X)
        assert_array_equal(model.labels_, all_noise)
# try arbitrary minimum sizes
@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size, global_dtype):
    """Check that no cluster is smaller than ``min_cluster_size``.

    Also checks that an integer ``min_cluster_size`` and the equivalent
    fraction of the sample count produce the same labels.
    """
    redX = X[::2].astype(global_dtype, copy=False)  # reduce for speed

    absolute = OPTICS(min_samples=9, min_cluster_size=min_cluster_size)
    absolute.fit(redX)

    # Sizes of the non-noise clusters (the noise label is -1).
    non_noise_labels = absolute.labels_[absolute.labels_ != -1]
    cluster_sizes = np.bincount(non_noise_labels)
    if cluster_sizes.size:
        assert min(cluster_sizes) >= min_cluster_size

    # Behaviour must be the same when min_cluster_size is a fraction.
    fractional = OPTICS(
        min_samples=9,
        min_cluster_size=min_cluster_size / redX.shape[0],
    )
    fractional.fit(redX)
    assert_array_equal(absolute.labels_, fractional.labels_)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_cluster_size_invalid2(csr_container):
    """Check that ``min_cluster_size`` above the sample count is rejected.

    Exercised on the dense data and on its sparse counterpart.
    """
    too_large = len(X) + 1
    expected_error = "must be no greater than the "

    dense_model = OPTICS(min_cluster_size=too_large)
    with pytest.raises(ValueError, match=expected_error):
        dense_model.fit(X)

    sparse_model = OPTICS(min_cluster_size=too_large, metric="euclidean")
    with pytest.raises(ValueError, match=expected_error):
        sparse_model.fit(csr_container(X))
def test_processing_order():
    """Check that all unprocessed points are candidates for the next point.

    The next point must be picked among every unprocessed point, not only
    among the direct neighbors of the current one.
    """
    Y = [[0], [10], [-10], [25]]
    fitted = OPTICS(min_samples=3, max_eps=15).fit(Y)

    expected = {
        "reachability_": [np.inf, 10, 10, 15],
        "core_distances_": [10, 15, np.inf, np.inf],
        "ordering_": [0, 1, 2, 3],
    }
    for attribute, values in expected.items():
        assert_array_equal(getattr(fitted, attribute), values)
def test_compare_to_ELKI():
    """Compare ordering, predecessors and reachability with ELKI references.

    Expected values were computed with (future) ELKI 0.7.5 using:
    java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
      -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    where the FixedDBIDsFilter gives 0-indexed ids.
    """
    inf = np.inf

    # ------------------------------------------------------------------
    # Reference 1: default max_eps. r1 = reachability, o1 = ordering,
    # p1 = predecessor, each listed in cluster order.
    # ------------------------------------------------------------------
    r1 = [
        inf, 1.0574896366427478, 0.7587934993548423,
        0.7290174038973836, 0.7290174038973836, 0.7290174038973836,
        0.6861627576116127, 0.7587934993548423, 0.9280118450166668,
        1.1748022534146194, 3.3355455741292257, 0.49618389254482587,
        0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
        0.2552805046961355, 0.3086779122185853, 4.163024452756142,
        1.623152630340929, 0.45315840475822655, 0.25468325192031926,
        0.2254004358159971, 0.18765711877083036, 0.1821471333893275,
        0.1821471333893275, 0.18765711877083036, 0.18765711877083036,
        0.2240202988740153, 1.154337614548715, 1.342604473837069,
        1.323308536402633, 0.8607514948648837, 0.27219111215810565,
        0.13260875220533205, 0.13260875220533205, 0.09890587675958984,
        0.09890587675958984, 0.13548790801634494, 0.1575483940837384,
        0.17515137170530226, 0.17575920159442388, 0.27219111215810565,
        0.6101447895405373, 1.3189208094864302, 1.323308536402633,
        2.2509184159764577, 2.4517810628594527, 3.675977064404973,
        3.8264795626020365, 2.9130735341510614, 2.9130735341510614,
        2.9130735341510614, 2.9130735341510614, 2.8459300127258036,
        2.8459300127258036, 2.8459300127258036, 3.0321982337972537,
    ]
    o1 = [
        0, 3, 6, 4, 7, 8, 2, 9, 5, 1,
        31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
        44, 21, 23, 24, 22, 25, 27, 29, 26, 28,
        20, 40, 45, 46, 10, 15, 11, 13, 17, 19,
        18, 12, 16, 14, 47, 49, 43, 48, 42, 41,
        53, 57, 51, 52, 56, 59, 54, 55, 58, 50,
    ]
    p1 = [
        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5,
        1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
        36, 44, 21, 23, 24, 22, 25, 25, 22, 22,
        22, 21, 40, 45, 46, 10, 15, 15, 13, 13,
        15, 11, 19, 15, 10, 47, 12, 45, 14, 43,
        42, 53, 57, 57, 57, 57, 59, 59, 59, 58,
    ]

    # Tests against known extraction array
    # Does NOT work with metric='euclidean', because sklearn euclidean has
    # worse numeric precision. 'minkowski' is slower but more accurate.
    clust1 = OPTICS(min_samples=5).fit(X)
    order1 = clust1.ordering_

    assert_array_equal(order1, np.array(o1))
    assert_array_equal(clust1.predecessor_[order1], np.array(p1))
    assert_allclose(clust1.reachability_[order1], np.array(r1))

    # ELKI currently does not print the core distances (which are not used
    # much in literature), but we can at least ensure this consistency:
    for i in order1[1:]:
        assert clust1.reachability_[i] >= clust1.core_distances_[clust1.predecessor_[i]]

    # ------------------------------------------------------------------
    # Reference 2: max_eps=0.5, also computed with (future) ELKI 0.7.5.
    # ------------------------------------------------------------------
    r2 = (
        [inf] * 11
        + [
            0.27219111215810565, 0.13260875220533205, 0.13260875220533205,
            0.09890587675958984, 0.09890587675958984, 0.13548790801634494,
            0.1575483940837384, 0.17515137170530226, 0.17575920159442388,
            0.27219111215810565, 0.4928068613197889,
        ]
        + [inf]
        + [
            0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
            0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
            0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
        ]
        + [inf]
        + [
            0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
            0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
            0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
        ]
        + [inf] * 18
    )
    o2 = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
        47, 46, 20, 22, 25, 23, 27, 29, 24, 26,
        28, 21, 30, 32, 34, 33, 38, 39, 35, 37,
        36, 31, 40, 41, 42, 43, 44, 45, 48, 49,
        50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    ]
    p2 = (
        [-1] * 11
        + [10, 15, 15, 13, 13, 15, 11, 19, 15, 10, 47]
        + [-1]
        + [20, 22, 25, 25, 25, 25, 22, 22, 23]
        + [-1]
        + [30, 30, 34, 34, 34, 32, 32, 37, 38]
        + [-1] * 18
    )

    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
    order2 = clust2.ordering_

    assert_array_equal(order2, np.array(o2))
    assert_array_equal(clust2.predecessor_[order2], np.array(p2))
    assert_allclose(clust2.reachability_[order2], np.array(r2))

    # Core distances not above 0.5 in the first fit must be identical in the
    # fit restricted to max_eps=0.5.
    index = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(clust1.core_distances_[index], clust2.core_distances_[index])
def test_extract_dbscan(global_dtype):
    """Check dbscan extraction on an easy case with four equally dense blobs."""
    rng = np.random.RandomState(0)
    n_points_per_cluster = 20
    blob_centers = ([-5, -2], [4, -1], [1, 2], [-2, 3])

    # One randn draw per blob, in the listed order.
    blobs = [
        center + 0.2 * rng.randn(n_points_per_cluster, 2) for center in blob_centers
    ]
    X = np.vstack(blobs).astype(global_dtype, copy=False)

    model = OPTICS(cluster_method="dbscan", eps=0.5).fit(X)
    found_labels = np.sort(np.unique(model.labels_))
    assert_array_equal(found_labels, [0, 1, 2, 3])
@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS)
def test_precomputed_dists(global_dtype, csr_container):
    """Check that precomputed distances give the same result as raw features."""
    redX = X[::2].astype(global_dtype, copy=False)
    dists = pairwise_distances(redX, metric="euclidean")
    if csr_container is not None:
        dists = csr_container(dists)

    from_distances = OPTICS(min_samples=10, algorithm="brute", metric="precomputed")
    with warnings.catch_warnings():
        # EfficiencyWarning is silenced for the precomputed fit only.
        warnings.simplefilter("ignore", EfficiencyWarning)
        from_distances.fit(dists)

    from_features = OPTICS(min_samples=10, algorithm="brute", metric="euclidean")
    from_features.fit(redX)

    assert_allclose(from_distances.reachability_, from_features.reachability_)
    assert_array_equal(from_distances.labels_, from_features.labels_)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_optics_input_not_modified_precomputed_sparse_nodiag(csr_container):
    """Check that a precomputed sparse matrix is not modified in place.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27508
    """
    dense = np.random.RandomState(0).rand(6, 6)
    # Zeros on the diagonal become implicit once the matrix is made sparse.
    # If the input were modified in place, they would be made explicit.
    np.fill_diagonal(dense, 0)
    X = csr_container(dense)

    # Sanity check: no stored entry sits on the diagonal.
    rows, cols = X.nonzero()
    assert all(row != col for row, col in zip(rows, cols))

    X_copy = X.copy()
    OPTICS(metric="precomputed").fit(X)

    # The input must be untouched, including no explicit zeros being added.
    assert X.nnz == X_copy.nnz
    assert_array_equal(X.toarray(), X_copy.toarray())
def test_optics_predecessor_correction_ordering():
    """Check that reordering the samples does not change the labels.

    The predecessor-based cluster correction used to rely on the wrong
    indices; on this example the labels of the reordered data must be the
    reordered labels of the original data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26324
    """
    original = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1)
    reorder = [0, 1, 2, 4, 5, 6, 7, 3]
    permuted = original[reorder]

    labels_original = OPTICS(min_samples=3, metric="euclidean").fit(original).labels_
    labels_permuted = OPTICS(min_samples=3, metric="euclidean").fit(permuted).labels_

    assert_array_equal(labels_original[reorder], labels_permuted)
@@ -0,0 +1,335 @@
"""Testing for Spectral Clustering methods"""
import pickle
import re
import numpy as np
import pytest
from scipy.linalg import LinAlgError
from sklearn.cluster import SpectralClustering, spectral_clustering
from sklearn.cluster._spectral import cluster_qr, discretize
from sklearn.datasets import make_blobs
from sklearn.feature_extraction import img_to_graph
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS
# Record whether the optional pyamg dependency can be imported.
try:
    from pyamg import smoothed_aggregation_solver  # noqa
except ImportError:
    amg_loaded = False
else:
    amg_loaded = True
# Module-level data: 60 two-dimensional samples generated by make_blobs around
# three centers, each shifted by +10, with cluster_std=0.4 and a fixed
# random_state.
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(
n_samples=60,
n_features=2,
centers=centers,
cluster_std=0.4,
shuffle=True,
random_state=0,
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering(eigen_solver, assign_labels, csr_container):
    """Check clustering of a precomputed affinity and pickling of the model.

    The affinity matrix describes two blocks (3 and 4 samples) that are only
    weakly linked; it is fitted both as a dense array and as a sparse matrix.
    """
    S = np.array(
        [
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        ]
    )
    expected_partition = [1, 1, 1, 0, 0, 0, 0]

    for affinity_matrix in (S, csr_container(S)):
        model = SpectralClustering(
            random_state=0,
            n_clusters=2,
            affinity="precomputed",
            eigen_solver=eigen_solver,
            assign_labels=assign_labels,
        )
        model.fit(affinity_matrix)

        # Normalise the label naming so that the first sample is labelled 1.
        labels = model.labels_
        if labels[0] == 0:
            labels = 1 - labels
        assert adjusted_rand_score(labels, expected_partition) == 1

        # A pickle round trip must keep the parameters and the fitted labels.
        restored = pickle.loads(pickle.dumps(model))
        assert restored.n_clusters == model.n_clusters
        assert restored.eigen_solver == model.eigen_solver
        assert_array_equal(restored.labels_, model.labels_)
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels, coo_container):
    """Check clustering of a sparse precomputed affinity built from two blobs."""
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # RBF affinity, shifted down by 1e-4 and clipped at zero, then stored in
    # the parametrized sparse container.
    affinity = rbf_kernel(X, gamma=1)
    affinity = np.maximum(affinity - 1e-4, 0)
    affinity = coo_container(affinity)

    model = SpectralClustering(
        random_state=0,
        n_clusters=2,
        affinity="precomputed",
        assign_labels=assign_labels,
    )
    model.fit(affinity)

    assert adjusted_rand_score(y, model.labels_) == 1
def test_precomputed_nearest_neighbors_filtering():
    """Check filtering of a precomputed graph that holds too many neighbors.

    A graph built with ten extra neighbors per sample must lead to the same
    labels as a graph built with exactly ``n_neighbors``.
    """
    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    n_neighbors = 2

    def labels_for(additional_neighbors):
        # Build the connectivity graph, then cluster it as a precomputed one.
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(X)
        graph = nn.kneighbors_graph(X, mode="connectivity")
        model = SpectralClustering(
            random_state=0,
            n_clusters=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
        )
        return model.fit(graph).labels_

    exact_labels = labels_for(0)
    oversized_labels = labels_for(10)
    assert_array_equal(exact_labels, oversized_labels)
def test_affinities():
    """Exercise the supported affinity options of SpectralClustering."""
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Nearest-neighbors affinity: warns about a disconnected graph but still
    # separates the two blobs.
    knn_model = SpectralClustering(
        n_clusters=2, affinity="nearest_neighbors", random_state=0
    )
    with pytest.warns(UserWarning, match="not fully connected"):
        knn_model.fit(X)
    assert adjusted_rand_score(y, knn_model.labels_) == 1

    # Default affinity with an explicit gamma.
    gamma_model = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    assert adjusted_rand_score(y, gamma_model.fit(X).labels_) == 1

    # From here on only the shape of the output is checked, on random data.
    X = check_random_state(10).rand(10, 5) * 10
    expected_shape = (X.shape[0],)

    for kernel_name in kernel_metrics():
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kernel_name == "additive_chi2":
            continue
        kernel_model = SpectralClustering(
            n_clusters=2, affinity=kernel_name, random_state=0
        )
        assert expected_shape == kernel_model.fit(X).labels_.shape

    # A constant callable is accepted as an affinity.
    constant_model = SpectralClustering(
        n_clusters=2, affinity=lambda x, y: 1, random_state=0
    )
    assert expected_shape == constant_model.fit(X).labels_.shape

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    histogram_model = SpectralClustering(
        n_clusters=2, affinity=histogram, random_state=0
    )
    assert expected_shape == histogram_model.fit(X).labels_.shape
def test_cluster_qr():
    """Check that cluster_qr labels are complete and independent of dtype.

    cluster_qr by itself should not be used for clustering generic data other
    than the rows of the eigenvectors within spectral clustering, but it must
    still give the same labels for float64 and float32 versions of a fixed
    input, even if those labels are meaningless.
    """
    rng = np.random.RandomState(seed=8)
    n_samples, n_components = 10, 5
    data = rng.randn(n_samples, n_components)

    labels_float64 = cluster_qr(data.astype(np.float64))
    labels_float32 = cluster_qr(data.astype(np.float32))

    # Each sample is assigned a cluster identifier
    assert labels_float64.shape == (n_samples,)
    # All components should be covered by the assignment
    all_components = np.arange(n_components)
    assert np.array_equal(np.unique(labels_float64), all_components)
    # Single precision data should yield the same cluster assignments
    assert np.array_equal(labels_float64, labels_float32)
def test_cluster_qr_permutation_invariance():
    """Check that cluster_qr is invariant to a permutation of the samples."""
    rng = np.random.RandomState(seed=8)
    n_samples, n_components = 100, 5
    data = rng.randn(n_samples, n_components)
    perm = rng.permutation(n_samples)

    # Permuting the labels must match labelling the permuted data.
    labels_then_permute = cluster_qr(data)[perm]
    permute_then_labels = cluster_qr(data[perm])
    assert np.array_equal(labels_then_permute, permute_then_labels)
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples, coo_container):
    """Check discretize on a noisy class-indicator matrix."""
    rng = np.random.RandomState(seed=8)
    row_index = np.arange(n_samples)

    for n_class in range(2, 10):
        n_columns = n_class + 1

        # random class labels
        y_true = np.array(rng.randint(0, n_columns, n_samples), float)

        # One-hot indicator of the true labels, built as a sparse matrix.
        y_indicator = coo_container(
            (np.ones(n_samples), (row_index, y_true)),
            shape=(n_samples, n_columns),
        )

        # Perturb the indicator with Gaussian noise of scale 0.1.
        noise = 0.1 * rng.randn(n_samples, n_columns)
        y_true_noisy = y_indicator.toarray() + noise

        y_pred = discretize(y_true_noisy, random_state=rng)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
# TODO: Remove when pyamg replaces the sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
def test_spectral_clustering_with_arpack_amg_solvers():
    """Check that the arpack and amg solvers give the same clustering.

    Based on the toy example from plot_segmentation_toy.py: a small image
    containing two overlapping discs.
    """
    rows, cols = np.indices((40, 40))
    center1, center2 = (14, 12), (20, 25)
    radius1, radius2 = 8, 7

    disc1 = (rows - center1[0]) ** 2 + (cols - center1[1]) ** 2 < radius1**2
    disc2 = (rows - center2[0]) ** 2 + (cols - center2[1]) ** 2 < radius2**2
    circles = disc1 | disc2

    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    labels_arpack = spectral_clustering(
        graph, n_clusters=2, eigen_solver="arpack", random_state=0
    )
    assert len(np.unique(labels_arpack)) == 2

    if not amg_loaded:
        # Without pyamg, requesting the amg solver must fail.
        with pytest.raises(ValueError):
            spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0)
        return

    labels_amg = spectral_clustering(
        graph, n_clusters=2, eigen_solver="amg", random_state=0
    )
    assert adjusted_rand_score(labels_arpack, labels_amg) == 1
def test_n_components():
    """Check the default of `n_components` and that it influences the result."""
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Reference: n_components left at its default.
    default_model = SpectralClustering(n_clusters=2, random_state=0)
    labels = default_model.fit(X).labels_

    # Setting n_components equal to n_clusters must not change anything,
    # which shows that n_components=n_clusters by default.
    explicit_model = SpectralClustering(n_clusters=2, n_components=2, random_state=0)
    labels_same_ncomp = explicit_model.fit(X).labels_
    assert_array_equal(labels, labels_same_ncomp)

    # With n_clusters left at its default of 8 and n_components=2 the
    # labels must differ from the reference.
    different_model = SpectralClustering(n_components=2, random_state=0)
    labels_diff_ncomp = different_model.fit(X).labels_
    assert not np.array_equal(labels, labels_diff_ncomp)
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_verbose(assign_labels, capsys):
    """Check the verbose output of SpectralClustering for each label strategy.

    Parameters
    ----------
    assign_labels : str
        Label assignment strategy forwarded to the estimator.
    capsys : pytest fixture
        Captures what the estimator prints to stdout.
    """
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # The parametrized strategy must be forwarded to the estimator; otherwise
    # every case silently runs the default "kmeans" strategy.
    SpectralClustering(
        n_clusters=2, random_state=42, verbose=1, assign_labels=assign_labels
    ).fit(X)

    captured = capsys.readouterr()

    assert re.search(r"Computing label assignment using", captured.out)

    # The KMeans progress messages are only expected for the kmeans strategy.
    if assign_labels == "kmeans":
        assert re.search(r"Initialization complete", captured.out)
        assert re.search(r"Iteration [0-9]+, inertia", captured.out)
def test_spectral_clustering_np_matrix_raises():
    """Check that spectral_clustering rejects np.matrix with a clear TypeError.

    See #10993.
    """
    affinity = np.matrix([[0.0, 2.0], [2.0, 0.0]])
    expected_message = r"np\.matrix is not supported. Please convert to a numpy array"

    with pytest.raises(TypeError, match=expected_message):
        spectral_clustering(affinity)
def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
    """Check that discretize raises LinAlgError when svd never converges.

    Non-regression test for #21380
    """

    def _svd_that_always_fails(*args, **kwargs):
        raise LinAlgError()

    # Replace numpy's SVD so that every call fails.
    monkeypatch.setattr(np.linalg, "svd", _svd_that_always_fails)

    with pytest.raises(LinAlgError, match="SVD did not converge"):
        discretize(np.ones((10, 4)))
@@ -0,0 +1,20 @@
"""Meta-estimators for building composite models with transformers
In addition to its current contents, this module will eventually be home to
refurbished versions of Pipeline and FeatureUnion.
"""
from ._column_transformer import (
ColumnTransformer,
make_column_selector,
make_column_transformer,
)
from ._target import TransformedTargetRegressor
# Public names of this package; each one is imported above from
# `._column_transformer` or `._target`.
__all__ = [
    "ColumnTransformer",
    "make_column_transformer",
    "TransformedTargetRegressor",
    "make_column_selector",
]
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,352 @@
# Authors: Andreas Mueller <andreas.mueller@columbia.edu>
# Guillaume Lemaitre <guillaume.lemaitre@inria.fr>
# License: BSD 3 clause
import warnings
import numpy as np
from ..base import BaseEstimator, RegressorMixin, _fit_context, clone
from ..exceptions import NotFittedError
from ..preprocessing import FunctionTransformer
from ..utils import _safe_indexing, check_array
from ..utils._param_validation import HasMethods
from ..utils._tags import _safe_tags
from ..utils.metadata_routing import (
_raise_for_unsupported_routing,
_RoutingNotSupportedMixin,
)
from ..utils.validation import check_is_fitted
__all__ = ["TransformedTargetRegressor"]
class TransformedTargetRegressor(
    _RoutingNotSupportedMixin, RegressorMixin, BaseEstimator
):
    """Meta-estimator to regress on a transformed target.

    Useful for applying a non-linear transformation to the target `y` in
    regression problems. This transformation can be given as a Transformer
    such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a
    function and its inverse such as `np.log` and `np.exp`.

    The computation during :meth:`fit` is::

        regressor.fit(X, func(y))

    or::

        regressor.fit(X, transformer.transform(y))

    The computation during :meth:`predict` is::

        inverse_func(regressor.predict(X))

    or::

        transformer.inverse_transform(regressor.predict(X))

    Read more in the :ref:`User Guide <transformed_target_regressor>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    regressor : object, default=None
        Regressor object such as derived from
        :class:`~sklearn.base.RegressorMixin`. This regressor will
        automatically be cloned each time prior to fitting. If `regressor is
        None`, :class:`~sklearn.linear_model.LinearRegression` is created and used.

    transformer : object, default=None
        Estimator object such as derived from
        :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time
        as `func` and `inverse_func`. If `transformer is None` as well as
        `func` and `inverse_func`, the transformer will be an identity
        transformer. Note that the transformer will be cloned during fitting.
        Also, the transformer is restricting `y` to be a numpy array.

    func : function, default=None
        Function to apply to `y` before passing to :meth:`fit`. Cannot be set
        at the same time as `transformer`. If `func is None`, the function used will be
        the identity function. If `func` is set, `inverse_func` also needs to be
        provided. The function needs to return a 2-dimensional array.

    inverse_func : function, default=None
        Function to apply to the prediction of the regressor. Cannot be set at
        the same time as `transformer`. The inverse function is used to return
        predictions to the same space of the original training labels. If
        `inverse_func` is set, `func` also needs to be provided. The inverse
        function needs to return a 2-dimensional array.

    check_inverse : bool, default=True
        Whether to check that `transform` followed by `inverse_transform`
        or `func` followed by `inverse_func` leads to the original targets.

    Attributes
    ----------
    regressor_ : object
        Fitted regressor.

    transformer_ : object
        Transformer used in :meth:`fit` and :meth:`predict`.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying regressor exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.preprocessing.FunctionTransformer : Construct a transformer from an
        arbitrary callable.

    Notes
    -----
    Internally, the target `y` is always converted into a 2-dimensional array
    to be used by scikit-learn transformers. At the time of prediction, the
    output will be reshaped to have the same number of dimensions as `y`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.compose import TransformedTargetRegressor
    >>> tt = TransformedTargetRegressor(regressor=LinearRegression(),
    ...                                 func=np.log, inverse_func=np.exp)
    >>> X = np.arange(4).reshape(-1, 1)
    >>> y = np.exp(2 * X).ravel()
    >>> tt.fit(X, y)
    TransformedTargetRegressor(...)
    >>> tt.score(X, y)
    1.0
    >>> tt.regressor_.coef_
    array([2.])

    For a more detailed example use case refer to
    :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py`.
    """

    # Constraints on the constructor parameters; `None` is allowed for every
    # parameter except `check_inverse`.
    _parameter_constraints: dict = {
        "regressor": [HasMethods(["fit", "predict"]), None],
        "transformer": [HasMethods("transform"), None],
        "func": [callable, None],
        "inverse_func": [callable, None],
        "check_inverse": ["boolean"],
    }

    def __init__(
        self,
        regressor=None,
        *,
        transformer=None,
        func=None,
        inverse_func=None,
        check_inverse=True,
    ):
        # Parameters are stored untouched; all checks happen at fit time.
        self.regressor = regressor
        self.transformer = transformer
        self.func = func
        self.inverse_func = inverse_func
        self.check_inverse = check_inverse

    def _fit_transformer(self, y):
        """Check transformer and fit transformer.

        Create the default transformer, fit it and make additional inverse
        check on a subset (optional).

        Parameters
        ----------
        y : array
            Target values; :meth:`fit` passes an array with at least two
            dimensions.

        Raises
        ------
        ValueError
            If `transformer` is set together with `func` or `inverse_func`,
            or if only one of `func` and `inverse_func` is set.
        """
        if self.transformer is not None and (
            self.func is not None or self.inverse_func is not None
        ):
            raise ValueError(
                "'transformer' and functions 'func'/'inverse_func' cannot both be set."
            )
        elif self.transformer is not None:
            # Work on a clone so the user-provided transformer is not fitted.
            self.transformer_ = clone(self.transformer)
        else:
            # No transformer given: `func` and `inverse_func` must be either
            # both set or both left to None.
            if (self.func is not None and self.inverse_func is None) or (
                self.func is None and self.inverse_func is not None
            ):
                lacking_param, existing_param = (
                    ("func", "inverse_func")
                    if self.func is None
                    else ("inverse_func", "func")
                )
                raise ValueError(
                    f"When '{existing_param}' is provided, '{lacking_param}' must also"
                    f" be provided. If {lacking_param} is supposed to be the default,"
                    " you need to explicitly pass it the identity function."
                )
            self.transformer_ = FunctionTransformer(
                func=self.func,
                inverse_func=self.inverse_func,
                validate=True,
                check_inverse=self.check_inverse,
            )
        # XXX: sample_weight is not currently passed to the
        # transformer. However, if transformer starts using sample_weight, the
        # code should be modified accordingly. At the time to consider the
        # sample_prop feature, it is also a good use case to be considered.
        self.transformer_.fit(y)
        if self.check_inverse:
            # Only a strided subset of the rows is checked: the step is
            # n_samples // 10, with a minimum of 1.
            idx_selected = slice(None, None, max(1, y.shape[0] // 10))
            y_sel = _safe_indexing(y, idx_selected)
            y_sel_t = self.transformer_.transform(y_sel)
            if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)):
                warnings.warn(
                    (
                        "The provided functions or transformer are"
                        " not strictly inverse of each other. If"
                        " you are sure you want to proceed regardless"
                        ", set 'check_inverse=False'"
                    ),
                    UserWarning,
                )

    @_fit_context(
        # TransformedTargetRegressor.regressor/transformer are not validated yet.
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, **fit_params):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        **fit_params : dict
            Parameters passed to the `fit` method of the underlying
            regressor.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If `y` is None.
        """
        _raise_for_unsupported_routing(self, "fit", **fit_params)
        if y is None:
            raise ValueError(
                f"This {self.__class__.__name__} estimator "
                "requires y to be passed, but the target y is None."
            )
        y = check_array(
            y,
            input_name="y",
            accept_sparse=False,
            force_all_finite=True,
            ensure_2d=False,
            dtype="numeric",
            allow_nd=True,
        )

        # store the number of dimension of the target to predict an array of
        # similar shape at predict
        self._training_dim = y.ndim

        # transformers are designed to modify X which is 2d dimensional, we
        # need to modify y accordingly.
        if y.ndim == 1:
            y_2d = y.reshape(-1, 1)
        else:
            y_2d = y
        self._fit_transformer(y_2d)

        # transform y and convert back to 1d array if needed
        y_trans = self.transformer_.transform(y_2d)
        # FIXME: a FunctionTransformer can return a 1D array even when validate
        # is set to True. Therefore, we need to check the number of dimension
        # first.
        if y_trans.ndim == 2 and y_trans.shape[1] == 1:
            y_trans = y_trans.squeeze(axis=1)

        if self.regressor is None:
            # Imported here rather than at module level.
            from ..linear_model import LinearRegression

            self.regressor_ = LinearRegression()
        else:
            self.regressor_ = clone(self.regressor)

        self.regressor_.fit(X, y_trans, **fit_params)

        # Mirror the feature names recorded by the fitted regressor, if any.
        if hasattr(self.regressor_, "feature_names_in_"):
            self.feature_names_in_ = self.regressor_.feature_names_in_

        return self

    def predict(self, X, **predict_params):
        """Predict using the base regressor, applying inverse.

        The regressor is used to predict and the `inverse_func` or
        `inverse_transform` is applied before returning the prediction.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        **predict_params : dict of str -> object
            Parameters passed to the `predict` method of the underlying
            regressor.

        Returns
        -------
        y_hat : ndarray of shape (n_samples,)
            Predicted values.
        """
        check_is_fitted(self)
        pred = self.regressor_.predict(X, **predict_params)
        # The transformer is given a 2D array, so 1D predictions are reshaped
        # into a single column first.
        if pred.ndim == 1:
            pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1))
        else:
            pred_trans = self.transformer_.inverse_transform(pred)
        # Return a 1D array when the training target was 1D.
        if (
            self._training_dim == 1
            and pred_trans.ndim == 2
            and pred_trans.shape[1] == 1
        ):
            pred_trans = pred_trans.squeeze(axis=1)

        return pred_trans

    def _more_tags(self):
        """Return estimator tags, taking `multioutput` from the regressor."""
        regressor = self.regressor
        if regressor is None:
            # Same default regressor as the one created in `fit`.
            from ..linear_model import LinearRegression

            regressor = LinearRegression()

        return {
            "poor_score": True,
            "multioutput": _safe_tags(regressor, key="multioutput"),
        }

    @property
    def n_features_in_(self):
        """Number of features seen during :term:`fit`."""
        # For consistency with other estimators we raise an AttributeError so
        # that hasattr() returns False when the estimator isn't fitted.
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            raise AttributeError(
                "{} object has no n_features_in_ attribute.".format(
                    self.__class__.__name__
                )
            ) from nfe

        return self.regressor_.n_features_in_
@@ -0,0 +1,395 @@
import numpy as np
import pytest
from sklearn import datasets
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import TransformedTargetRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.utils._testing import assert_allclose, assert_no_warnings
# Shared (X, y) regression dataset, generated once with a fixed seed and
# reused by the tests in this module.
friedman = datasets.make_friedman1(random_state=0)
def test_transform_target_regressor_error():
    """Check the errors raised at fit time for invalid configurations."""
    X, y = friedman

    # A transformer and functions are provided at the same time.
    conflicting = TransformedTargetRegressor(
        regressor=LinearRegression(),
        transformer=StandardScaler(),
        func=np.exp,
        inverse_func=np.log,
    )
    conflict_message = (
        "'transformer' and functions 'func'/'inverse_func' cannot both be set."
    )
    with pytest.raises(ValueError, match=conflict_message):
        conflicting.fit(X, y)

    # sample_weight is given to a regressor which does not support it.
    sample_weight = np.ones((y.shape[0],))
    no_weight_support = TransformedTargetRegressor(
        regressor=OrthogonalMatchingPursuit(), transformer=StandardScaler()
    )
    weight_message = r"fit\(\) got an unexpected keyword argument 'sample_weight'"
    with pytest.raises(TypeError, match=weight_message):
        no_weight_support.fit(X, y, sample_weight=sample_weight)

    # Only `func` is given.
    only_func = TransformedTargetRegressor(func=np.exp)
    with pytest.raises(
        ValueError,
        match="When 'func' is provided, 'inverse_func' must also be provided",
    ):
        only_func.fit(X, y)

    # Only `inverse_func` is given.
    only_inverse = TransformedTargetRegressor(inverse_func=np.log)
    with pytest.raises(
        ValueError,
        match="When 'inverse_func' is provided, 'func' must also be provided",
    ):
        only_inverse.fit(X, y)
def test_transform_target_regressor_invertible():
    """Check the warning for non-inverse functions and how to silence it."""
    X, y = friedman
    warning_message = (
        "The provided functions or"
        " transformer are not strictly inverse of each other."
    )

    # sqrt and log are not inverse of each other: a warning is expected.
    checked = TransformedTargetRegressor(
        regressor=LinearRegression(),
        func=np.sqrt,
        inverse_func=np.log,
        check_inverse=True,
    )
    with pytest.warns(UserWarning, match=warning_message):
        checked.fit(X, y)

    # With the check disabled through set_params, no warning is expected.
    unchecked = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log
    )
    unchecked.set_params(check_inverse=False)
    assert_no_warnings(unchecked.fit, X, y)
def _check_standard_scaled(y, y_pred):
    """Assert that `y_pred` equals `y` standardized along axis 0."""
    centered = y - np.mean(y, axis=0)
    expected = centered / np.std(y, axis=0)
    assert_allclose(expected, y_pred)
def _check_shifted_by_one(y, y_pred):
    """Assert that `y_pred` equals `y` shifted up by one."""
    expected = y + 1
    assert_allclose(expected, y_pred)
def test_transform_target_regressor_functions():
    """Check a log/exp function pair on a 1D target."""
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.log, inverse_func=np.exp
    )
    y_pred = regr.fit(X, y).predict(X)

    # The internal transformer applies `func` and `inverse_func`.
    fitted_transformer = regr.transformer_
    y_tran = fitted_transformer.transform(y.reshape(-1, 1)).squeeze()
    assert_allclose(np.log(y), y_tran)
    y_back = fitted_transformer.inverse_transform(y_tran.reshape(-1, 1)).squeeze()
    assert_allclose(y, y_back)

    # Predictions keep the target shape and go through `inverse_func`.
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))

    # The inner regressor matches one fitted directly on func(y).
    reference = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), reference.coef_.ravel())
def test_transform_target_regressor_functions_multioutput():
    """Check a log/exp function pair on a two-column target."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), func=np.log, inverse_func=np.exp
    )
    y_pred = regr.fit(X, y).predict(X)

    # The internal transformer applies `func` and `inverse_func`.
    fitted_transformer = regr.transformer_
    y_tran = fitted_transformer.transform(y)
    assert_allclose(np.log(y), y_tran)
    assert_allclose(y, fitted_transformer.inverse_transform(y_tran))

    # Predictions keep the target shape and go through `inverse_func`.
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))

    # The inner regressor matches one fitted directly on func(y).
    reference = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), reference.coef_.ravel())
@pytest.mark.parametrize(
    "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)]
)
def test_transform_target_regressor_1d_transformer(X, y):
    """Check shape consistency with a transformer that accepts 1D input.

    All transformers in scikit-learn expect 2D data. FunctionTransformer with
    validate=False lifts this constraint without checking that the input is
    2D. Consistency of the data shape is checked with a 1D and a 2D `y`.
    """
    transformer = FunctionTransformer(
        func=lambda x: x + 1, inverse_func=lambda x: x - 1
    )
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), transformer=transformer
    )
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # Forward transform.
    fitted_transformer = regr.transformer_
    y_tran = fitted_transformer.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape

    # Inverse transform.
    assert_allclose(y, fitted_transformer.inverse_transform(y_tran).squeeze())

    # The regressor matches a manual transform / fit / inverse pipeline.
    reference = LinearRegression()
    reference_transformer = clone(transformer)
    reference.fit(X, reference_transformer.fit_transform(y))
    reference_pred = reference.predict(X)
    assert_allclose(y_pred, reference_transformer.inverse_transform(reference_pred))
    assert_allclose(regr.regressor_.coef_, reference.coef_)
@pytest.mark.parametrize(
    "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)]
)
def test_transform_target_regressor_2d_transformer(X, y):
    """Check consistency with a 2D-only transformer and a 1D or 2D `y`."""
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), transformer=transformer
    )
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # The scaler needs 2D input, so a 1D target is turned into one column.
    target_is_1d = y.ndim == 1
    y_as_2d = y.reshape(-1, 1) if target_is_1d else y

    # Forward transform.
    y_tran = regr.transformer_.transform(y_as_2d)
    _check_standard_scaled(y, y_tran.squeeze())
    assert y.shape == y_pred.shape

    # Inverse transform.
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())

    # The regressor matches a manual transform / fit / inverse pipeline.
    reference = LinearRegression()
    reference_transformer = clone(transformer)
    reference_target = reference_transformer.fit_transform(y_as_2d)
    if target_is_1d:
        reference.fit(X, reference_target.squeeze())
        reference_pred = reference.predict(X).reshape(-1, 1)
        y_pred2 = reference_transformer.inverse_transform(reference_pred).squeeze()
    else:
        reference.fit(X, reference_target)
        reference_pred = reference.predict(X)
        y_pred2 = reference_transformer.inverse_transform(reference_pred)

    assert_allclose(y_pred, y_pred2)
    assert_allclose(regr.regressor_.coef_, reference.coef_)
def test_transform_target_regressor_2d_transformer_multioutput():
    """Check consistency with a 2D-only transformer and a two-column `y`."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(), transformer=transformer
    )
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape

    # Forward transform.
    fitted_transformer = regr.transformer_
    y_tran = fitted_transformer.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape

    # Inverse transform.
    assert_allclose(y, fitted_transformer.inverse_transform(y_tran).squeeze())

    # The regressor matches a manual transform / fit / inverse pipeline.
    reference = LinearRegression()
    reference_transformer = clone(transformer)
    reference.fit(X, reference_transformer.fit_transform(y))
    reference_pred = reference.predict(X)
    assert_allclose(y_pred, reference_transformer.inverse_transform(reference_pred))
    assert_allclose(regr.regressor_.coef_, reference.coef_)
def test_transform_target_regressor_3d_target():
    """Check a 3D target with a transformer that reshapes the target.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/18866
    """
    X = friedman[0]
    # Repeat the 1D target into an array of shape (n_samples, 3, 2).
    y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2])

    def flatten_data(data):
        # Collapse all trailing dimensions into one.
        n_rows = data.shape[0]
        return data.reshape(n_rows, -1)

    def unflatten_data(data):
        # Restore a trailing dimension of size 2.
        n_rows = data.shape[0]
        return data.reshape(n_rows, -1, 2)

    regr = TransformedTargetRegressor(
        regressor=LinearRegression(),
        transformer=FunctionTransformer(
            func=flatten_data, inverse_func=unflatten_data
        ),
    )
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
def test_transform_target_regressor_multi_to_single():
    """Check functions that reduce a two-column target to a single output."""
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def norm_as_column(y):
        # Row-wise Euclidean norm returned as a 2D column.
        norms = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return norms[:, np.newaxis]

    def norm_as_vector(y):
        # Same norm, but returned as a 1D array.
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    def identity(y):
        return y

    tt_2d = TransformedTargetRegressor(
        func=norm_as_column, inverse_func=identity, check_inverse=False
    )
    tt_2d.fit(X, y)
    y_pred_2d_func = tt_2d.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    # Force the function to return only a 1D array.
    tt_1d = TransformedTargetRegressor(
        func=norm_as_vector, inverse_func=identity, check_inverse=False
    )
    tt_1d.fit(X, y)
    y_pred_1d_func = tt_1d.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)
class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator):
    """Identity transformer asserting that its input is always an ndarray."""

    def fit(self, X, y=None):
        """Assert `X` is an ndarray and return the transformer itself."""
        assert isinstance(X, np.ndarray)
        return self

    def transform(self, X):
        """Assert `X` is an ndarray and return it unchanged."""
        assert isinstance(X, np.ndarray)
        return X

    def inverse_transform(self, X):
        """Assert `X` is an ndarray and return it unchanged."""
        assert isinstance(X, np.ndarray)
        return X
class DummyCheckerListRegressor(DummyRegressor):
    """DummyRegressor asserting that `X` reaches it as a plain list."""

    def fit(self, X, y, sample_weight=None):
        """Assert `X` is a list, then delegate to DummyRegressor.fit."""
        assert isinstance(X, list)
        fitted = super().fit(X, y, sample_weight)
        return fitted

    def predict(self, X):
        """Assert `X` is a list, then delegate to DummyRegressor.predict."""
        assert isinstance(X, list)
        predictions = super().predict(X)
        return predictions
def test_transform_target_regressor_ensure_y_array():
    """Check that `y` reaches the transformer as an array and `X` untouched.

    The target passed to the transformer must always be a numpy array, while
    an `X` passed as a list must reach the regressor as a list.
    """
    X, y = friedman
    X_list, y_list = X.tolist(), y.tolist()

    tt = TransformedTargetRegressor(
        transformer=DummyCheckerArrayTransformer(),
        regressor=DummyCheckerListRegressor(),
        check_inverse=False,
    )

    # Lists for both inputs satisfy the checks of both dummies.
    tt.fit(X_list, y_list)
    tt.predict(X_list)

    # An array `X` trips the list assertion of the dummy regressor.
    with pytest.raises(AssertionError):
        tt.fit(X, y_list)
    with pytest.raises(AssertionError):
        tt.predict(X)
class DummyTransformer(TransformerMixin, BaseEstimator):
    """Identity transformer that counts how many times `fit` was called."""

    def __init__(self, fit_counter=0):
        self.fit_counter = fit_counter

    def fit(self, X, y=None):
        """Increase the fit counter by one and return the transformer."""
        self.fit_counter = self.fit_counter + 1
        return self

    def transform(self, X):
        """Return `X` unchanged."""
        return X

    def inverse_transform(self, X):
        """Return `X` unchanged."""
        return X
@pytest.mark.parametrize("check_inverse", [False, True])
def test_transform_target_regressor_count_fit(check_inverse):
    """Check that the transformer is fitted exactly once.

    Regression test for gh-issue #11618.
    """
    X, y = friedman
    counting_transformer = DummyTransformer()
    ttr = TransformedTargetRegressor(
        transformer=counting_transformer, check_inverse=check_inverse
    )
    ttr.fit(X, y)

    n_fit_calls = ttr.transformer_.fit_counter
    assert n_fit_calls == 1
class DummyRegressorWithExtraFitParams(DummyRegressor):
    """DummyRegressor whose `fit` requires `check_input` to be passed as False."""

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Assert `check_input` is falsy, then delegate to DummyRegressor.fit."""
        # The tests force this to False; the assertion proves the value was
        # actually passed down to the regressor.
        assert not check_input
        fitted = super().fit(X, y, sample_weight)
        return fitted
def test_transform_target_regressor_pass_fit_parameters():
    """Check that extra fit parameters are forwarded to the regressor."""
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraFitParams(),
        transformer=DummyTransformer(),
    )

    # The dummy regressor fails its own assertion unless check_input=False
    # reaches it.
    regr.fit(X, y, check_input=False)

    n_fit_calls = regr.transformer_.fit_counter
    assert n_fit_calls == 1
def test_transform_target_regressor_route_pipeline():
    """Check that fit parameters given to a Pipeline reach the regressor."""
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraFitParams(),
        transformer=DummyTransformer(),
    )
    pip = Pipeline([("normalize", StandardScaler()), ("est", regr)])

    # The parameter is addressed to the "est" step with the
    # `<step>__<param>` naming.
    step_fit_params = {"est__check_input": False}
    pip.fit(X, y, **step_fit_params)

    assert regr.transformer_.fit_counter == 1
class DummyRegressorWithExtraPredictParams(DummyRegressor):
    """DummyRegressor whose `predict` requires `check_input` to be False."""

    def predict(self, X, check_input=True):
        """Record the call, assert `check_input` is falsy, then delegate."""
        # Set before the assertion so the flag exists even if the check fails.
        self.predict_called = True
        # The test passes check_input=False; this proves it was forwarded.
        assert not check_input
        predictions = super().predict(X)
        return predictions
def test_transform_target_regressor_pass_extra_predict_parameters():
    """Check that predict keyword arguments are passed to the regressor."""
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraPredictParams(),
        transformer=DummyTransformer(),
    )
    regr.fit(X, y)

    # The dummy regressor asserts that check_input arrives as False.
    regr.predict(X, check_input=False)

    fitted_regressor = regr.regressor_
    assert fitted_regressor.predict_called
@@ -0,0 +1,315 @@
import builtins
import platform
import sys
from contextlib import suppress
from functools import wraps
from os import environ
from unittest import SkipTest
import joblib
import numpy as np
import pytest
from _pytest.doctest import DoctestItem
from threadpoolctl import threadpool_limits
from sklearn import config_context, set_config
from sklearn._min_dependencies import PYTEST_MIN_VERSION
from sklearn.datasets import (
fetch_20newsgroups,
fetch_20newsgroups_vectorized,
fetch_california_housing,
fetch_covtype,
fetch_kddcup99,
fetch_lfw_pairs,
fetch_lfw_people,
fetch_olivetti_faces,
fetch_rcv1,
fetch_species_distributions,
)
from sklearn.tests import random_seed
from sklearn.utils._testing import get_pytest_filterwarning_lines
from sklearn.utils.fixes import (
_IS_32BIT,
np_base_version,
parse_version,
sp_version,
)
# Fail at import time when the installed pytest is older than the minimum
# version imported from sklearn._min_dependencies.
if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION):
    raise ImportError(
        f"Your version of pytest is too old. Got version {pytest.__version__}, you"
        f" should have pytest >= {PYTEST_MIN_VERSION} installed."
    )
# True when the installed SciPy is 1.10 or newer; raccoon_face_or_skip uses
# this flag to decide whether the face image requires network access.
scipy_datasets_require_network = sp_version >= parse_version("1.10")
@pytest.fixture
def enable_slep006():
    """Enable SLEP006 for all tests."""
    # Metadata routing stays enabled while the test body runs and the
    # context manager exits once the fixture is torn down.
    routing_enabled = config_context(enable_metadata_routing=True)
    with routing_enabled:
        yield
def raccoon_face_or_skip():
    """Return the grayscale raccoon face image, or skip the test.

    Raises
    ------
    unittest.SkipTest
        When the installed SciPy needs the network for this dataset and
        either network tests are disabled or `pooch` is not installed.
    """
    if not scipy_datasets_require_network:
        from scipy.misc import face

        return face(gray=True)

    # SciPy >= 1.10 requires network to access to get data
    network_tests_enabled = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
    if not network_tests_enabled:
        raise SkipTest("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")

    try:
        import pooch  # noqa
    except ImportError:
        raise SkipTest("test requires pooch to be installed")

    from scipy.datasets import face

    return face(gray=True)
# Maps a fixture name (ending in "_fxt") to the function that fetches the
# corresponding dataset. pytest_collection_modifyitems reads the keys of
# this mapping.
dataset_fetchers = {
    "fetch_20newsgroups_fxt": fetch_20newsgroups,
    "fetch_20newsgroups_vectorized_fxt": fetch_20newsgroups_vectorized,
    "fetch_california_housing_fxt": fetch_california_housing,
    "fetch_covtype_fxt": fetch_covtype,
    "fetch_kddcup99_fxt": fetch_kddcup99,
    "fetch_lfw_pairs_fxt": fetch_lfw_pairs,
    "fetch_lfw_people_fxt": fetch_lfw_people,
    "fetch_olivetti_faces_fxt": fetch_olivetti_faces,
    "fetch_rcv1_fxt": fetch_rcv1,
    "fetch_species_distributions_fxt": fetch_species_distributions,
}

# The raccoon face is only registered as a fetched dataset when SciPy needs
# the network to provide it.
if scipy_datasets_require_network:
    dataset_fetchers["raccoon_face_fxt"] = raccoon_face_or_skip
# Skip marker applied to float32 parametrizations: they only run when the
# environment variable SKLEARN_RUN_FLOAT32_TESTS is set to "1".
_SKIP32_MARK = pytest.mark.skipif(
    environ.get("SKLEARN_RUN_FLOAT32_TESTS", "0") != "1",
    reason="Set SKLEARN_RUN_FLOAT32_TESTS=1 to run float32 dtype tests",
)
# Global fixtures
@pytest.fixture(
    params=[
        pytest.param(np.float32, marks=_SKIP32_MARK),
        np.float64,
    ]
)
def global_dtype(request):
    """Parametrize a test over ``np.float32`` and ``np.float64``.

    The ``np.float32`` case carries ``_SKIP32_MARK`` and is therefore skipped
    unless ``SKLEARN_RUN_FLOAT32_TESTS=1`` is set.
    """
    dtype = request.param
    yield dtype
def _fetch_fixture(f):
    """Build a fixture that exposes dataset fetcher `f` to tests.

    The fixture value is a wrapper around `f` that forces the
    ``download_if_missing`` keyword: downloads are only allowed when
    ``SKLEARN_SKIP_NETWORK_TESTS=0``. When the data is absent and downloads
    are not allowed, the calling test is skipped; any other ``OSError`` is
    re-raised.

    Parameters
    ----------
    f : callable
        Dataset fetcher accepting a ``download_if_missing`` keyword argument.

    Returns
    -------
    fixture : pytest fixture
        Fixture whose value is the wrapped fetcher.
    """
    allow_download = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
    missing_data_message = "Data not found and `download_if_missing` is False"

    @wraps(f)
    def wrapped(*args, **kwargs):
        kwargs["download_if_missing"] = allow_download
        try:
            return f(*args, **kwargs)
        except OSError as exc:
            # Only the "data is missing" error is turned into a skip.
            if str(exc) == missing_data_message:
                pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
            raise

    return pytest.fixture(lambda: wrapped)
# Adds fixtures for fetching data. Each `*_fxt` fixture yields the matching
# fetcher wrapped by `_fetch_fixture`; the names must stay in sync with the
# keys of `dataset_fetchers`.
fetch_20newsgroups_fxt = _fetch_fixture(fetch_20newsgroups)
fetch_20newsgroups_vectorized_fxt = _fetch_fixture(fetch_20newsgroups_vectorized)
fetch_california_housing_fxt = _fetch_fixture(fetch_california_housing)
fetch_covtype_fxt = _fetch_fixture(fetch_covtype)
fetch_kddcup99_fxt = _fetch_fixture(fetch_kddcup99)
fetch_lfw_pairs_fxt = _fetch_fixture(fetch_lfw_pairs)
fetch_lfw_people_fxt = _fetch_fixture(fetch_lfw_people)
fetch_olivetti_faces_fxt = _fetch_fixture(fetch_olivetti_faces)
fetch_rcv1_fxt = _fetch_fixture(fetch_rcv1)
fetch_species_distributions_fxt = _fetch_fixture(fetch_species_distributions)
# Unlike the fetchers above, this fixture calls `raccoon_face_or_skip` itself,
# so its value is the function's return value (or the test is skipped).
raccoon_face_fxt = pytest.fixture(raccoon_face_or_skip)
def pytest_collection_modifyitems(config, items):
    """Called after collect is completed.

    The hook runs the following steps, in order:

    1. tests relying on a dataset fixture are either skipped (network tests
       disabled) or have their dataset downloaded up front;
    2. tests whose name ends with ``GradientBoostingClassifier`` are marked
       as expected failures on ``aarch64`` machines;
    3. the globals of every doctest are emptied, and doctests are skipped on
       unsupported configurations;
    4. image-related doctests are skipped when pillow cannot be imported.

    Parameters
    ----------
    config : pytest config
    items : list of collected items
    """
    network_enabled = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0"
    skip_network = pytest.mark.skip(
        reason="test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0"
    )

    # download datasets during collection to avoid thread unsafe behavior
    # when running pytest in parallel with pytest-xdist
    known_dataset_fixtures = set(dataset_fetchers)
    datasets_to_download = set()

    for item in items:
        if isinstance(item, DoctestItem) and "fetch_" in item.name:
            # A doctest of a fetcher: derive the fixture name from the last
            # component of the dotted doctest name.
            fetcher_name = item.name.split(".")[-1]
            requested = {f"{fetcher_name}_fxt"}
        elif hasattr(item, "fixturenames"):
            requested = set(item.fixturenames)
        else:
            continue

        needed = requested & known_dataset_fixtures
        if not needed:
            continue

        if network_enabled:
            datasets_to_download.update(needed)
        else:
            # network tests are skipped
            item.add_marker(skip_network)

    # Only download datasets on the first worker spawned by pytest-xdist
    # to avoid thread unsafe behavior. If pytest-xdist is not used, we still
    # download before tests run.
    is_first_worker = environ.get("PYTEST_XDIST_WORKER", "gw0") == "gw0"
    if is_first_worker and network_enabled:
        for fixture_name in datasets_to_download:
            with suppress(SkipTest):
                dataset_fetchers[fixture_name]()

    # Known failure with GradientBoostingClassifier on ARM64
    if platform.machine() == "aarch64":
        arm64_xfail = pytest.mark.xfail(
            reason=(
                "know failure. See "
                "https://github.com/scikit-learn/scikit-learn/issues/17797"  # noqa
            )
        )
        for item in items:
            if item.name.endswith("GradientBoostingClassifier"):
                item.add_marker(arm64_xfail)

    # Collect every reason to skip the doctests; the last one recorded is the
    # one reported in the skip marker.
    doctest_skip_reasons = []
    try:
        import matplotlib  # noqa
    except ImportError:
        doctest_skip_reasons.append("matplotlib is required to run the doctests")

    if _IS_32BIT:
        doctest_skip_reasons.append(
            "doctest are only run when the default numpy int is 64 bits."
        )
    elif sys.platform.startswith("win32"):
        doctest_skip_reasons.append(
            "doctests are not run for Windows because numpy arrays "
            "repr is inconsistent across platforms."
        )

    if np_base_version >= parse_version("2"):
        doctest_skip_reasons.append(
            "Due to NEP 51 numpy scalar repr has changed in numpy 2"
        )

    doctest_items = [item for item in items if isinstance(item, DoctestItem)]

    # Normally doctest has the entire module's scope. Here we set globs to an
    # empty dict to remove the module's scope:
    # https://docs.python.org/3/library/doctest.html#what-s-the-execution-context
    for doctest_item in doctest_items:
        doctest_item.dtest.globs = {}

    if doctest_skip_reasons:
        skip_marker = pytest.mark.skip(reason=doctest_skip_reasons[-1])
        for doctest_item in doctest_items:
            # work-around an internal error with pytest if adding a skip
            # mark to a doctest in a contextmanager, see
            # https://github.com/pytest-dev/pytest/issues/8796 for more
            # details.
            if doctest_item.name != "sklearn._config.config_context":
                doctest_item.add_marker(skip_marker)

    try:
        import PIL  # noqa
    except ImportError:
        pillow_installed = False
    else:
        pillow_installed = True

    if not pillow_installed:
        pillow_skip = pytest.mark.skip(reason="pillow (or PIL) not installed!")
        pillow_dependent_names = (
            "sklearn.feature_extraction.image.PatchExtractor",
            "sklearn.feature_extraction.image.extract_patches_2d",
        )
        for item in items:
            if item.name in pillow_dependent_names:
                item.add_marker(pillow_skip)
@pytest.fixture(scope="function")
def pyplot():
    """Setup and teardown fixture for matplotlib.

    The requesting test is skipped when ``matplotlib.pyplot`` cannot be
    imported. Otherwise all figures are closed before the test starts and
    again after it has finished.

    Returns
    -------
    pyplot : module
        The ``matplotlib.pyplot`` module.
    """
    plt = pytest.importorskip("matplotlib.pyplot")
    # Start from a clean slate: drop figures left over by earlier code.
    plt.close("all")
    yield plt
    # Teardown: drop figures created by the test itself.
    plt.close("all")
def pytest_configure(config):
    """Configure the test session before any test is run.

    Selects the matplotlib ``agg`` backend when matplotlib is importable,
    caps the number of threads passed to ``threadpool_limits``, registers the
    ``random_seed`` plugin if needed and optionally installs the warning
    filters returned by ``get_pytest_filterwarning_lines``.

    Parameters
    ----------
    config : pytest config
    """
    # Use matplotlib agg backend during the tests including doctests
    with suppress(ImportError):
        import matplotlib

        matplotlib.use("agg")

    physical_cores = joblib.cpu_count(only_physical_cores=True)
    xdist_worker_count = environ.get("PYTEST_XDIST_WORKER_COUNT")
    if xdist_worker_count is None:
        allowed_parallelism = physical_cores
    else:
        # Set the number of OpenMP and BLAS threads based on the number of
        # workers xdist is using to prevent oversubscription.
        allowed_parallelism = max(physical_cores // int(xdist_worker_count), 1)
    threadpool_limits(allowed_parallelism)

    # Register global_random_seed plugin if it is not already registered
    plugin_manager = config.pluginmanager
    if not plugin_manager.hasplugin("sklearn.tests.random_seed"):
        plugin_manager.register(random_seed)

    warnings_as_errors = environ.get("SKLEARN_WARNINGS_AS_ERRORS", "0") != "0"
    if warnings_as_errors:
        # This seems like the only way to programmatically change the config
        # filterwarnings. This was suggested in
        # https://github.com/pytest-dev/pytest/issues/3311#issuecomment-373177592
        for filter_line in get_pytest_filterwarning_lines():
            config.addinivalue_line("filterwarnings", filter_line)
@pytest.fixture
def hide_available_pandas(monkeypatch):
    """Pretend pandas was not installed.

    ``builtins.__import__`` is monkeypatched so that importing the module
    named exactly ``"pandas"`` raises ``ImportError``; every other import is
    forwarded to the original import function.
    """
    real_import = builtins.__import__

    def mocked_import(name, *args, **kwargs):
        if name != "pandas":
            return real_import(name, *args, **kwargs)
        raise ImportError()

    monkeypatch.setattr(builtins, "__import__", mocked_import)
@pytest.fixture
def print_changed_only_false():
    """Set `print_changed_only` to False for the duration of the test.

    After the test, the option is set back to ``True`` unconditionally, i.e.
    the value in effect before the test is not remembered.
    """
    set_config(print_changed_only=False)
    yield
    set_config(print_changed_only=True)  # reset to default
@@ -0,0 +1,44 @@
"""
The :mod:`sklearn.covariance` module includes methods and algorithms to
robustly estimate the covariance of features given a set of points. The
precision matrix defined as the inverse of the covariance is also estimated.
Covariance estimation is closely related to the theory of Gaussian Graphical
Models.
"""
from ._elliptic_envelope import EllipticEnvelope
from ._empirical_covariance import (
EmpiricalCovariance,
empirical_covariance,
log_likelihood,
)
from ._graph_lasso import GraphicalLasso, GraphicalLassoCV, graphical_lasso
from ._robust_covariance import MinCovDet, fast_mcd
from ._shrunk_covariance import (
OAS,
LedoitWolf,
ShrunkCovariance,
ledoit_wolf,
ledoit_wolf_shrinkage,
oas,
shrunk_covariance,
)
# Public names of the `sklearn.covariance` package; every entry is imported
# above from one of the private submodules.
__all__ = [
    "EllipticEnvelope",
    "EmpiricalCovariance",
    "GraphicalLasso",
    "GraphicalLassoCV",
    "LedoitWolf",
    "MinCovDet",
    "OAS",
    "ShrunkCovariance",
    "empirical_covariance",
    "fast_mcd",
    "graphical_lasso",
    "ledoit_wolf",
    "ledoit_wolf_shrinkage",
    "log_likelihood",
    "oas",
    "shrunk_covariance",
]
@@ -0,0 +1,267 @@
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
from numbers import Real
import numpy as np
from ..base import OutlierMixin, _fit_context
from ..metrics import accuracy_score
from ..utils._param_validation import Interval
from ..utils.validation import check_is_fitted
from ._robust_covariance import MinCovDet
class EllipticEnvelope(OutlierMixin, MinCovDet):
    """An object for detecting outliers in a Gaussian distributed dataset.

    Read more in the :ref:`User Guide <outlier_detection>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Whether the estimated precision is stored.

    assume_centered : bool, default=False
        If True, the support of robust location and covariance estimates
        is computed, and a covariance estimate is recomputed from it,
        without centering the data.
        Useful to work with data whose mean is very close to, but not
        exactly, zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. If None, the minimum value of support_fraction will
        be used within the algorithm:
        `(n_samples + n_features + 1) / 2 * n_samples`.
        Range is (0, 1).

    contamination : float, default=0.1
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Range is (0, 0.5].

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling
        the data. Pass an int for reproducible results across multiple
        function calls. See :term:`Glossary <random_state>`.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated robust location.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute the
        robust estimates of location and shape.

    offset_ : float
        Offset used to define the decision function from the raw scores.
        We have the relation: ``decision_function = score_samples - offset_``.
        The offset depends on the contamination parameter and is defined in
        such a way we obtain the expected number of outliers (samples with
        decision function < 0) in training.

        .. versionadded:: 0.20

    raw_location_ : ndarray of shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : ndarray of shape (n_features, n_features)
        The raw robust estimated covariance before correction and
        re-weighting.

    raw_support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    dist_ : ndarray of shape (n_samples,)
        Mahalanobis distances of the training set (on which :meth:`fit` is
        called) observations.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    Outlier detection from covariance estimation may break or not
    perform well in high-dimensional settings. In particular, one will
    always take care to work with ``n_samples > n_features ** 2``.

    References
    ----------
    .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
       minimum covariance determinant estimator" Technometrics 41(3), 212
       (1999)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EllipticEnvelope
    >>> true_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
    ...                                                  cov=true_cov,
    ...                                                  size=500)
    >>> cov = EllipticEnvelope(random_state=0).fit(X)
    >>> # predict returns 1 for an inlier and -1 for an outlier
    >>> cov.predict([[0, 0],
    ...              [3, 3]])
    array([ 1, -1])
    >>> cov.covariance_
    array([[0.7411..., 0.2535...],
           [0.2535..., 0.3053...]])
    >>> cov.location_
    array([0.0813... , 0.0427...])
    """

    # Same constraints as MinCovDet, plus the contamination level in (0, 0.5].
    _parameter_constraints: dict = {
        **MinCovDet._parameter_constraints,
        "contamination": [Interval(Real, 0, 0.5, closed="right")],
    }

    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        support_fraction=None,
        contamination=0.1,
        random_state=None,
    ):
        super().__init__(
            store_precision=store_precision,
            assume_centered=assume_centered,
            support_fraction=support_fraction,
            random_state=random_state,
        )
        self.contamination = contamination

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the EllipticEnvelope model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        super().fit(X)
        # The offset is the `contamination` quantile of the negated training
        # distances, expressed as a percentage for `np.percentile`.
        contamination_percent = 100.0 * self.contamination
        self.offset_ = np.percentile(-self.dist_, contamination_percent)
        return self

    def decision_function(self, X):
        """Compute the decision function of the given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        decision : ndarray of shape (n_samples,)
            Decision function of the samples.
            It is equal to the shifted Mahalanobis distances, i.e.
            ``score_samples(X) - offset_``.
            The threshold for being an outlier is 0, which ensures a
            compatibility with other outlier detection algorithms.
        """
        check_is_fitted(self)
        return self.score_samples(X) - self.offset_

    def score_samples(self, X):
        """Compute the negative Mahalanobis distances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        negative_mahal_distances : array-like of shape (n_samples,)
            Opposite of the Mahalanobis distances, as returned by
            :meth:`mahalanobis`.
        """
        check_is_fitted(self)
        mahal_distances = self.mahalanobis(X)
        return -mahal_distances

    def predict(self, X):
        """
        Predict labels (1 inlier, -1 outlier) of X according to fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        decision = self.decision_function(X)
        # A non-negative decision value marks an inlier.
        return np.where(decision >= 0, 1, -1)

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        predicted = self.predict(X)
        return accuracy_score(y, predicted, sample_weight=sample_weight)
@@ -0,0 +1,364 @@
"""
Maximum likelihood covariance estimator.
"""
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
# avoid division truncation
import warnings
import numpy as np
from scipy import linalg
from .. import config_context
from ..base import BaseEstimator, _fit_context
from ..metrics.pairwise import pairwise_distances
from ..utils import check_array
from ..utils._param_validation import validate_params
from ..utils.extmath import fast_logdet
@validate_params(
    {
        "emp_cov": [np.ndarray],
        "precision": [np.ndarray],
    },
    prefer_skip_nested_validation=True,
)
def log_likelihood(emp_cov, precision):
    """Compute the sample mean of the log_likelihood under a covariance model.

    Computes the empirical expected log-likelihood, allowing for universal
    comparison (beyond this software package), and accounts for normalization
    terms and scaling.

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        Maximum Likelihood Estimator of covariance.

    precision : ndarray of shape (n_features, n_features)
        The precision matrix of the covariance model to be tested.

    Returns
    -------
    log_likelihood_ : float
        Sample mean of the log-likelihood.
    """
    n_features = precision.shape[0]
    # sum(emp_cov * precision) is the element-wise product summed over all
    # entries of the two matrices.
    data_term = -np.sum(emp_cov * precision) + fast_logdet(precision)
    normalization = n_features * np.log(2 * np.pi)
    return (data_term - normalization) / 2.0
@validate_params(
    {
        "X": ["array-like"],
        "assume_centered": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def empirical_covariance(X, *, assume_centered=False):
    """Compute the Maximum likelihood covariance estimator.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the covariance estimate. A 1D input is
        treated as a single sample.

    assume_centered : bool, default=False
        If `True`, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If `False`, data will be centered before computation.

    Returns
    -------
    covariance : ndarray of shape (n_features, n_features)
        Empirical covariance (Maximum Likelihood Estimator).

    Examples
    --------
    >>> from sklearn.covariance import empirical_covariance
    >>> X = [[1,1,1],[1,1,1],[1,1,1],
    ...      [0,0,0],[0,0,0],[0,0,0]]
    >>> empirical_covariance(X)
    array([[0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25]])
    """
    X = check_array(X, ensure_2d=False, force_all_finite=False)

    if X.ndim == 1:
        X = X.reshape(1, -1)

    n_samples = X.shape[0]
    if n_samples == 1:
        warnings.warn(
            "Only one sample available. You may want to reshape your data array"
        )

    if assume_centered:
        covariance = np.dot(X.T, X) / n_samples
    else:
        covariance = np.cov(X.T, bias=1)

    # Promote a 0-d result to a (1, 1) matrix so that callers always get a
    # 2D array.
    return np.array([[covariance]]) if covariance.ndim == 0 else covariance
class EmpiricalCovariance(BaseEstimator):
    """Maximum likelihood covariance estimator.

    Read more in the :ref:`User Guide <covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specifies if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo-inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EmpiricalCovariance
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = EmpiricalCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7569..., 0.2818...],
           [0.2818..., 0.3928...]])
    >>> cov.location_
    array([0.0622..., 0.0193...])
    """

    _parameter_constraints: dict = {
        "store_precision": ["boolean"],
        "assume_centered": ["boolean"],
    }

    def __init__(self, *, store_precision=True, assume_centered=False):
        self.store_precision = store_precision
        self.assume_centered = assume_centered

    def _set_covariance(self, covariance):
        """Saves the covariance and precision estimates

        The precision is the pseudo-inverse of `covariance`; it is stored
        only when `self.store_precision` is true, otherwise `precision_` is
        set to None.

        Parameters
        ----------
        covariance : array-like of shape (n_features, n_features)
            Estimated covariance matrix to be stored, and from which precision
            is computed.
        """
        covariance = check_array(covariance)
        self.covariance_ = covariance
        self.precision_ = (
            linalg.pinvh(covariance, check_finite=False)
            if self.store_precision
            else None
        )

    def get_precision(self):
        """Getter for the precision matrix.

        Returns
        -------
        precision_ : array-like of shape (n_features, n_features)
            The precision matrix associated to the current covariance object.
        """
        if self.store_precision:
            return self.precision_
        # Nothing was stored at fit time: recompute from the covariance.
        return linalg.pinvh(self.covariance_, check_finite=False)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the maximum likelihood covariance estimator to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
          Training data, where `n_samples` is the number of samples and
          `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X)
        # Centered data are assumed to have a zero mean.
        self.location_ = np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
        self._set_covariance(
            empirical_covariance(X, assume_centered=self.assume_centered)
        )
        return self

    def score(self, X_test, y=None):
        """Compute the log-likelihood of `X_test` under the estimated Gaussian model.

        The Gaussian model is defined by its mean and covariance matrix which are
        represented respectively by `self.location_` and `self.covariance_`.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Test data of which we compute the likelihood, where `n_samples` is
            the number of samples and `n_features` is the number of features.
            `X_test` is assumed to be drawn from the same distribution as
            the data used in fit (including centering).

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        res : float
            The log-likelihood of `X_test` with `self.location_` and `self.covariance_`
            as estimators of the Gaussian model mean and covariance matrix respectively.
        """
        X_test = self._validate_data(X_test, reset=False)
        # Covariance of the test set around the fitted location.
        centered_test = X_test - self.location_
        test_cov = empirical_covariance(centered_test, assume_centered=True)
        return log_likelihood(test_cov, self.get_precision())

    def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True):
        """Compute the Mean Squared Error between two covariance estimators.

        Parameters
        ----------
        comp_cov : array-like of shape (n_features, n_features)
            The covariance to compare with.

        norm : {"frobenius", "spectral"}, default="frobenius"
            The type of norm used to compute the error. Available error types:
            - 'frobenius' (default): sqrt(tr(A^t.A))
            - 'spectral': sqrt(max(eigenvalues(A^t.A)))
            where A is the error ``(comp_cov - self.covariance_)``.

        scaling : bool, default=True
            If True (default), the squared error norm is divided by n_features.
            If False, the squared error norm is not rescaled.

        squared : bool, default=True
            Whether to compute the squared error norm or the error norm.
            If True (default), the squared error norm is returned.
            If False, the error norm is returned.

        Returns
        -------
        result : float
            The Mean Squared Error (in the sense of the Frobenius norm) between
            `self` and `comp_cov` covariance estimators.
        """
        difference = comp_cov - self.covariance_

        if norm == "frobenius":
            squared_norm = np.sum(difference**2)
        elif norm == "spectral":
            gram = np.dot(difference.T, difference)
            squared_norm = np.amax(linalg.svdvals(gram))
        else:
            raise NotImplementedError(
                "Only spectral and frobenius norms are implemented"
            )

        if scaling:
            squared_norm = squared_norm / difference.shape[0]

        return squared_norm if squared else np.sqrt(squared_norm)

    def mahalanobis(self, X):
        """Compute the squared Mahalanobis distances of given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The observations whose Mahalanobis distances we compute.
            Observations are assumed to be drawn from the same distribution
            as the data used in fit.

        Returns
        -------
        dist : ndarray of shape (n_samples,)
            Squared Mahalanobis distances of the observations.
        """
        X = self._validate_data(X, reset=False)
        precision = self.get_precision()
        # Single-row matrix holding the fitted location.
        center = self.location_[np.newaxis, :]
        with config_context(assume_finite=True):
            pairwise = pairwise_distances(
                X, center, metric="mahalanobis", VI=precision
            )
        # Flatten the (n_samples, 1) result and square it.
        return np.reshape(pairwise, (len(X),)) ** 2
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,869 @@
"""
Robust location and covariance estimators.
Here are implemented estimators that are resistant to outliers.
"""
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
import numpy as np
from scipy import linalg
from scipy.stats import chi2
from ..base import _fit_context
from ..utils import check_array, check_random_state
from ..utils._param_validation import Interval
from ..utils.extmath import fast_logdet
from ._empirical_covariance import EmpiricalCovariance, empirical_covariance
# Minimum Covariance Determinant
# Implementation of an algorithm by Rousseeuw & Van Driessen described in
# (A Fast Algorithm for the Minimum Covariance Determinant Estimator,
# 1999, American Statistical Association and the American Society
# for Quality, TECHNOMETRICS)
# XXX Is this really a public function? It's not listed in the docs or
# exported by sklearn.covariance. Deprecate?
def c_step(
    X,
    n_support,
    remaining_iterations=30,
    initial_estimates=None,
    verbose=False,
    cov_computation_method=empirical_covariance,
    random_state=None,
):
    """C_step procedure described in [Rouseeuw1999]_ aiming at computing MCD.

    `X` is converted to an array and `random_state` to a random number
    generator; the computation itself is delegated to `_c_step`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data set in which we look for the n_support observations whose
        scatter matrix has minimum determinant.

    n_support : int
        Number of observations to compute the robust estimates of location
        and covariance from. This parameter must be greater than
        `n_samples / 2`.

    remaining_iterations : int, default=30
        Number of iterations to perform.
        According to [Rouseeuw1999]_, two iterations are sufficient to get
        close to the minimum, and we never need more than 30 to reach
        convergence.

    initial_estimates : tuple of shape (2,), default=None
        Initial estimates of location and shape from which to run the c_step
        procedure:
        - initial_estimates[0]: an initial location estimate
        - initial_estimates[1]: an initial covariance estimate

    verbose : bool, default=False
        Verbose mode.

    cov_computation_method : callable, \
            default=:func:`sklearn.covariance.empirical_covariance`
        The function which will be used to compute the covariance.
        Must return array of shape (n_features, n_features).

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    location : ndarray of shape (n_features,)
        Robust location estimates.

    covariance : ndarray of shape (n_features, n_features)
        Robust covariance estimates.

    support : ndarray of shape (n_samples,)
        A mask for the `n_support` observations whose scatter matrix has
        minimum determinant.

    References
    ----------
    .. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS
    """
    data = np.asarray(X)
    rng = check_random_state(random_state)
    return _c_step(
        data,
        n_support,
        rng,
        remaining_iterations=remaining_iterations,
        initial_estimates=initial_estimates,
        verbose=verbose,
        cov_computation_method=cov_computation_method,
    )
def _c_step(
    X,
    n_support,
    random_state,
    remaining_iterations=30,
    initial_estimates=None,
    verbose=False,
    cov_computation_method=empirical_covariance,
):
    """Run the iterative C-step refinement on an already validated input.

    Starting from a random subset of `n_support` observations, or from the
    given `initial_estimates`, the support is repeatedly replaced by the
    `n_support` observations with the smallest distances to the current
    estimates, for as long as `fast_logdet` of the covariance decreases.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data set; `X.shape` must unpack into two values.

    n_support : int
        Number of observations kept in the support.

    random_state : RandomState instance
        Only its `permutation` method is used, to draw the initial support
        when `initial_estimates` is None.

    remaining_iterations : int, default=30
        Maximum number of iterations of the refinement loop.

    initial_estimates : tuple of shape (2,), default=None
        Initial (location, covariance) pair used to build the first support.

    verbose : bool, default=False
        If True, print a message on convergence or when the iteration budget
        is exhausted.

    cov_computation_method : callable
        Function computing the covariance of the support observations.

    Returns
    -------
    results : tuple
        `(location, covariance, det, support, dist)` where `det` is the value
        returned by `fast_logdet` for `covariance` and `dist` holds one
        distance per observation. When the determinant increased during the
        last iteration, the values of the previous iteration are returned.
    """
    n_samples, n_features = X.shape
    dist = np.inf

    # Initialisation
    support = np.zeros(n_samples, dtype=bool)
    if initial_estimates is None:
        # compute initial robust estimates from a random subset
        support[random_state.permutation(n_samples)[:n_support]] = True
    else:
        # get initial robust estimates from the function parameters
        location = initial_estimates[0]
        covariance = initial_estimates[1]
        # run a special iteration for that case (to get an initial support)
        precision = linalg.pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(1)
        # compute new estimates
        support[np.argsort(dist)[:n_support]] = True

    X_support = X[support]
    location = X_support.mean(0)
    covariance = cov_computation_method(X_support)

    # Iterative procedure for Minimum Covariance Determinant computation
    det = fast_logdet(covariance)
    # If the data already has singular covariance, calculate the precision,
    # as the loop below will not be entered.
    # NOTE(review): if the loop is skipped for another reason (e.g.
    # remaining_iterations <= 0) while initial_estimates is None, `precision`
    # looks unbound when `dist` is computed after the loop -- confirm.
    if np.isinf(det):
        precision = linalg.pinvh(covariance)

    previous_det = np.inf
    while det < previous_det and remaining_iterations > 0 and not np.isinf(det):
        # save old estimates values
        previous_location = location
        previous_covariance = covariance
        previous_det = det
        previous_support = support
        # compute a new support from the full data set mahalanobis distances
        precision = linalg.pinvh(covariance)
        X_centered = X - location
        dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)
        # compute new estimates
        support = np.zeros(n_samples, dtype=bool)
        support[np.argsort(dist)[:n_support]] = True
        X_support = X[support]
        location = X_support.mean(axis=0)
        covariance = cov_computation_method(X_support)
        det = fast_logdet(covariance)
        # update remaining iterations for early stopping
        remaining_iterations -= 1

    # `previous_dist` keeps the distances computed in the last iteration (or
    # the initial value of `dist` if the loop was never entered). The new
    # `dist` uses the latest location with the last `precision` assigned
    # above, which is not recomputed from the final covariance here.
    previous_dist = dist
    dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)
    # Check if best fit already found (det => 0, logdet => -inf)
    if np.isinf(det):
        results = location, covariance, det, support, dist
    # Check convergence
    if np.allclose(det, previous_det):
        # c_step procedure converged
        if verbose:
            print(
                "Optimal couple (location, covariance) found before"
                " ending iterations (%d left)" % (remaining_iterations)
            )
        results = location, covariance, det, support, dist
    elif det > previous_det:
        # determinant has increased (should not happen)
        warnings.warn(
            "Determinant has increased; this should not happen: "
            "log(det) > log(previous_det) (%.15f > %.15f). "
            "You may want to try with a higher value of "
            "support_fraction (current value: %.3f)."
            % (det, previous_det, n_support / n_samples),
            RuntimeWarning,
        )
        # Fall back to the estimates of the previous iteration.
        results = (
            previous_location,
            previous_covariance,
            previous_det,
            previous_support,
            previous_dist,
        )

    # Check early stopping
    if remaining_iterations == 0:
        if verbose:
            print("Maximum number of iterations reached")
        results = location, covariance, det, support, dist

    # NOTE(review): `results` is only bound by the branches above; if none of
    # them is taken (e.g. `det` is NaN with iterations left) this line would
    # raise UnboundLocalError -- confirm whether that can happen in practice.
    return results
def select_candidates(
    X,
    n_support,
    n_trials,
    select=1,
    n_iter=30,
    verbose=False,
    cov_computation_method=empirical_covariance,
    random_state=None,
):
    """Find the best pure subsets of observations to compute MCD from.

    Several c_step procedures are run, either from random initial supports
    or from user-supplied initial estimates, and the `select` runs whose
    covariance has the smallest (log-)determinant are kept. The retained
    observations of a run form its `support`, i.e. a "pure" data set of
    `n_support` samples meant not to contain outliers.

    The c_step procedure was introduced by Rousseeuw and Van Driessen in
    [RV]_.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data (sub)set in which we look for the n_support purest observations.

    n_support : int
        The number of samples the pure data set must contain.
        This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.

    n_trials : int or tuple of shape (2,)
        Number of different initial sets of observations from which to
        run the algorithm. This parameter should be a strictly positive
        integer.
        Instead of giving a number of trials to perform, one can provide a
        tuple of initial estimates that will be used to iteratively run
        c_step procedures. In this case:

        - n_trials[0]: array-like, shape (n_trials, n_features)
          is the list of `n_trials` initial location estimates
        - n_trials[1]: array-like, shape (n_trials, n_features, n_features)
          is the list of `n_trials` initial covariances estimates

    select : int, default=1
        Number of best candidates results to return. This parameter must be
        a strictly positive integer.

    n_iter : int, default=30
        Maximum number of iterations for the c_step procedure.
        (2 is enough to be close to the final solution. "Never" exceeds 20).
        This parameter must be a strictly positive integer.

    verbose : bool, default=False
        Control the output verbosity.

    cov_computation_method : callable, \
            default=:func:`sklearn.covariance.empirical_covariance`
        The function which will be used to compute the covariance.
        Must return an array of shape (n_features, n_features).

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    best_locations : ndarray of shape (select, n_features)
        The `select` location estimates computed from the `select` best
        supports found in the data set (`X`).

    best_covariances : ndarray of shape (select, n_features, n_features)
        The `select` covariance estimates computed from the `select`
        best supports found in the data set (`X`).

    best_supports : ndarray of shape (select, n_samples)
        The `select` best supports found in the data set (`X`).

    best_ds : ndarray of shape (select, n_samples)
        The per-sample distances returned by the c_step procedure for each
        of the `select` best candidates.

    Raises
    ------
    TypeError
        If `n_trials` is neither an integer nor a tuple.

    See Also
    --------
    c_step

    References
    ----------
    .. [RV] A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS
    """
    random_state = check_random_state(random_state)

    # `estimates` stays None when the runs start from random supports.
    estimates = None
    if isinstance(n_trials, Integral):
        pass
    elif isinstance(n_trials, tuple):
        estimates = n_trials
        n_trials = estimates[0].shape[0]
    else:
        raise TypeError(
            "Invalid 'n_trials' parameter, expected tuple or integer, got %s (%s)"
            % (n_trials, type(n_trials))
        )

    # keyword arguments shared by every c_step run
    step_kwargs = dict(
        remaining_iterations=n_iter,
        verbose=verbose,
        cov_computation_method=cov_computation_method,
        random_state=random_state,
    )

    # compute `n_trials` location and shape estimates candidates in the subset
    if estimates is None:
        # one run per random initial support
        runs = [_c_step(X, n_support, **step_kwargs) for _ in range(n_trials)]
    else:
        # one run per supplied (location, covariance) couple
        runs = [
            _c_step(
                X,
                n_support,
                initial_estimates=(estimates[0][k], estimates[1][k]),
                **step_kwargs,
            )
            for k in range(n_trials)
        ]

    locations, covariances, dets, supports, dists = zip(*runs)

    # keep the `select` runs with the smallest determinant
    best = np.argsort(dets)[:select]
    return (
        np.asarray(locations)[best],
        np.asarray(covariances)[best],
        np.asarray(supports)[best],
        np.asarray(dists)[best],
    )
def fast_mcd(
    X,
    support_fraction=None,
    cov_computation_method=empirical_covariance,
    random_state=None,
):
    """Estimate the Minimum Covariance Determinant matrix.

    Read more in the :ref:`User Guide <robust_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix, with p features and n samples.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is `None`, which implies that the minimum
        value of `support_fraction` will be used within the algorithm:
        `(n_samples + n_features + 1) / (2 * n_samples)`. This parameter must
        be in the range (0, 1).

    cov_computation_method : callable, \
            default=:func:`sklearn.covariance.empirical_covariance`
        The function which will be used to compute the covariance.
        Must return an array of shape (n_features, n_features).

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    location : ndarray of shape (n_features,)
        Robust location of the data.

    covariance : ndarray of shape (n_features, n_features)
        Robust covariance of the features.

    support : ndarray of shape (n_samples,), dtype=bool
        A mask of the observations that have been used to compute
        the robust location and covariance estimates of the data set.

    dist : ndarray of shape (n_samples,)
        Distances of the observations associated with the returned
        estimates. For 500 < n_samples < 1500 only the entries of the
        randomly selected merged set are filled; the others are zero.

    Notes
    -----
    The FastMCD algorithm has been introduced by Rousseeuw and Van Driessen
    in "A Fast Algorithm for the Minimum Covariance Determinant Estimator,
    1999, American Statistical Association and the American Society
    for Quality, TECHNOMETRICS".
    The principle is to compute robust estimates on random subsets before
    pooling them into a larger subsets, and finally into the full data set.
    Depending on the size of the initial sample, we have one, two or three
    such computation levels.

    Note that only raw estimates are returned. If one is interested in
    the correction and reweighting steps described in [RouseeuwVan]_,
    see the MinCovDet object.

    References
    ----------

    .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance
        Determinant Estimator, 1999, American Statistical Association
        and the American Society for Quality, TECHNOMETRICS

    .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,
        Asymptotics For The Minimum Covariance Determinant Estimator,
        The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
    """
    random_state = check_random_state(random_state)

    X = check_array(X, ensure_min_samples=2, estimator="fast_mcd")
    n_samples, n_features = X.shape

    # minimum breakdown value
    if support_fraction is None:
        n_support = int(np.ceil(0.5 * (n_samples + n_features + 1)))
    else:
        n_support = int(support_fraction * n_samples)

    # 1-dimensional case quick computation
    # (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust
    #  Regression and Outlier Detection, John Wiley & Sons, chapter 4)
    if n_features == 1:
        if n_support < n_samples:
            # find the sample shortest halves
            X_sorted = np.sort(np.ravel(X))
            diff = X_sorted[n_support:] - X_sorted[: (n_samples - n_support)]
            halves_start = np.where(diff == np.min(diff))[0]
            # take the middle points' mean to get the robust location estimate
            location = (
                0.5
                * (X_sorted[n_support + halves_start] + X_sorted[halves_start]).mean()
            )
            support = np.zeros(n_samples, dtype=bool)
            X_centered = X - location
            # the support is made of the n_support points closest to location
            support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True
            covariance = np.asarray([[np.var(X[support])]])
            location = np.array([location])
        else:
            # every observation belongs to the support
            support = np.ones(n_samples, dtype=bool)
            covariance = np.asarray([[np.var(X)]])
            location = np.asarray([np.mean(X)])
            X_centered = X - location
        # get precision matrix in an optimized way
        precision = linalg.pinvh(covariance)
        dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)

    # Starting FastMCD algorithm for p-dimensional case
    if (n_samples > 500) and (n_features > 1):
        # 1. Find candidate supports on subsets
        # a. split the set in subsets of size ~ 300
        n_subsets = n_samples // 300
        n_samples_subsets = n_samples // n_subsets
        samples_shuffle = random_state.permutation(n_samples)
        h_subset = int(np.ceil(n_samples_subsets * (n_support / float(n_samples))))
        # b. perform a total of 500 trials
        n_trials_tot = 500
        # c. select 10 best (location, covariance) for each subset
        n_best_sub = 10
        n_trials = max(10, n_trials_tot // n_subsets)
        n_best_tot = n_subsets * n_best_sub
        all_best_locations = np.zeros((n_best_tot, n_features))
        try:
            all_best_covariances = np.zeros((n_best_tot, n_features, n_features))
        except MemoryError:
            # The above is too big. Let's try with something much smaller
            # (and less optimal): keep only 2 candidates per subset. Both
            # buffers are re-allocated with the same number of rows so that
            # locations and covariances stay paired and every per-subset
            # slice below remains in bounds.
            n_best_sub = 2
            n_best_tot = n_subsets * n_best_sub
            all_best_locations = np.zeros((n_best_tot, n_features))
            all_best_covariances = np.zeros((n_best_tot, n_features, n_features))
        for i in range(n_subsets):
            low_bound = i * n_samples_subsets
            high_bound = low_bound + n_samples_subsets
            current_subset = X[samples_shuffle[low_bound:high_bound]]
            best_locations_sub, best_covariances_sub, _, _ = select_candidates(
                current_subset,
                h_subset,
                n_trials,
                select=n_best_sub,
                n_iter=2,
                cov_computation_method=cov_computation_method,
                random_state=random_state,
            )
            subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub)
            all_best_locations[subset_slice] = best_locations_sub
            all_best_covariances[subset_slice] = best_covariances_sub
        # 2. Pool the candidate supports into a merged set
        # (possibly the full dataset)
        n_samples_merged = min(1500, n_samples)
        h_merged = int(np.ceil(n_samples_merged * (n_support / float(n_samples))))
        if n_samples > 1500:
            n_best_merged = 10
        else:
            n_best_merged = 1
        # find the best couples (location, covariance) on the merged set
        selection = random_state.permutation(n_samples)[:n_samples_merged]
        locations_merged, covariances_merged, supports_merged, d = select_candidates(
            X[selection],
            h_merged,
            n_trials=(all_best_locations, all_best_covariances),
            select=n_best_merged,
            cov_computation_method=cov_computation_method,
            random_state=random_state,
        )
        # 3. Finally get the overall best (locations, covariance) couple
        if n_samples < 1500:
            # directly get the best couple (location, covariance)
            location = locations_merged[0]
            covariance = covariances_merged[0]
            # map the merged-set results back onto the full sample index
            support = np.zeros(n_samples, dtype=bool)
            dist = np.zeros(n_samples)
            support[selection] = supports_merged[0]
            dist[selection] = d[0]
        else:
            # select the best couple on the full dataset
            locations_full, covariances_full, supports_full, d = select_candidates(
                X,
                n_support,
                n_trials=(locations_merged, covariances_merged),
                select=1,
                cov_computation_method=cov_computation_method,
                random_state=random_state,
            )
            location = locations_full[0]
            covariance = covariances_full[0]
            support = supports_full[0]
            dist = d[0]
    elif n_features > 1:
        # 1. Find the 10 best couples (location, covariance)
        # considering two iterations
        n_trials = 30
        n_best = 10
        locations_best, covariances_best, _, _ = select_candidates(
            X,
            n_support,
            n_trials=n_trials,
            select=n_best,
            n_iter=2,
            cov_computation_method=cov_computation_method,
            random_state=random_state,
        )
        # 2. Select the best couple on the full dataset amongst the 10
        locations_full, covariances_full, supports_full, d = select_candidates(
            X,
            n_support,
            n_trials=(locations_best, covariances_best),
            select=1,
            cov_computation_method=cov_computation_method,
            random_state=random_state,
        )
        location = locations_full[0]
        covariance = covariances_full[0]
        support = supports_full[0]
        dist = d[0]

    return location, covariance, support, dist
class MinCovDet(EmpiricalCovariance):
    """Minimum Covariance Determinant (MCD): robust estimator of covariance.

    The Minimum Covariance Determinant covariance estimator is to be applied
    on Gaussian-distributed data, but could still be relevant on data
    drawn from a unimodal, symmetric distribution. It is not meant to be used
    with multi-modal data (the algorithm used to fit a MinCovDet object is
    likely to fail in such a case).
    One should consider projection pursuit methods to deal with multi-modal
    datasets.

    Read more in the :ref:`User Guide <robust_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum
        value of support_fraction will be used within the algorithm:
        `(n_samples + n_features + 1) / (2 * n_samples)`. The parameter must
        be in the range (0, 1].

    random_state : int, RandomState instance or None, default=None
        Determines the pseudo random number generator for shuffling the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    raw_location_ : ndarray of shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : ndarray of shape (n_features, n_features)
        The raw robust estimated covariance before correction and re-weighting.

    raw_support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    location_ : ndarray of shape (n_features,)
        Estimated robust location.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated robust covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the robust estimates of location and shape.

    dist_ : ndarray of shape (n_samples,)
        Mahalanobis distances of the training set (on which :meth:`fit` is
        called) observations.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    LedoitWolf : LedoitWolf Estimator.
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    References
    ----------

    .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression.
        J. Am Stat Ass, 79:871, 1984.
    .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant
        Estimator, 1999, American Statistical Association and the American
        Society for Quality, TECHNOMETRICS
    .. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,
        Asymptotics For The Minimum Covariance Determinant Estimator,
        The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import MinCovDet
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                                   cov=real_cov,
    ...                                   size=500)
    >>> cov = MinCovDet(random_state=0).fit(X)
    >>> cov.covariance_
    array([[0.7411..., 0.2535...],
           [0.2535..., 0.3053...]])
    >>> cov.location_
    array([0.0813... , 0.0427...])
    """

    _parameter_constraints: dict = {
        **EmpiricalCovariance._parameter_constraints,
        "support_fraction": [Interval(Real, 0, 1, closed="right"), None],
        "random_state": ["random_state"],
    }
    # covariance function used on the (assumed outlier-free) support
    _nonrobust_covariance = staticmethod(empirical_covariance)

    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        support_fraction=None,
        random_state=None,
    ):
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit a Minimum Covariance Determinant with the FastMCD algorithm.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, ensure_min_samples=2, estimator="MinCovDet")
        rng = check_random_state(self.random_state)
        n_features = X.shape[1]

        # warn when X.T @ X has fewer than n_features significant
        # singular values, i.e. the empirical covariance is rank deficient
        significant = linalg.svdvals(np.dot(X.T, X)) > 1e-8
        if significant.sum() != n_features:
            warnings.warn(
                "The covariance matrix associated to your dataset is not full rank"
            )

        # raw FastMCD estimates
        raw_location, raw_covariance, raw_support, raw_dist = fast_mcd(
            X,
            support_fraction=self.support_fraction,
            cov_computation_method=self._nonrobust_covariance,
            random_state=rng,
        )
        if self.assume_centered:
            # keep the support, but recompute everything around the origin
            raw_location = np.zeros(n_features)
            raw_covariance = self._nonrobust_covariance(
                X[raw_support], assume_centered=True
            )
            raw_precision = linalg.pinvh(raw_covariance)
            raw_dist = (np.dot(X, raw_precision) * X).sum(axis=1)

        self.raw_location_ = self.location_ = raw_location
        self.raw_support_ = self.support_ = raw_support
        self.raw_covariance_ = raw_covariance
        self.dist_ = raw_dist

        # obtain consistency at normal models, then re-weight the estimator
        self.correct_covariance(X)
        self.reweight_covariance(X)

        return self

    def correct_covariance(self, data):
        """Apply a correction to raw Minimum Covariance Determinant estimates.

        Correction using the empirical correction factor suggested
        by Rousseeuw and Van Driessen in [RVD]_. Besides returning the
        corrected covariance, this rescales `dist_` in place by the same
        factor.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)
            The data matrix, with p features and n samples.
            The data set must be the one which was used to compute
            the raw estimates.

        Returns
        -------
        covariance_corrected : ndarray of shape (n_features, n_features)
            Corrected robust covariance estimate.

        Raises
        ------
        ValueError
            If the support is a strict subset of the samples and the raw
            covariance is (numerically) zero.

        References
        ----------

        .. [RVD] A Fast Algorithm for the Minimum Covariance
            Determinant Estimator, 1999, American Statistical Association
            and the American Society for Quality, TECHNOMETRICS
        """
        # A zero covariance on the support would give self.dist_ = 0 and
        # therefore a zero correction factor.
        partial_support = np.sum(self.support_) < len(self.dist_)
        if partial_support and np.allclose(self.raw_covariance_, 0):
            raise ValueError(
                "The covariance matrix of the support data "
                "is equal to 0, try to increase support_fraction"
            )
        chi2_median = chi2(data.shape[1]).isf(0.5)
        correction = np.median(self.dist_) / chi2_median
        # in-place division: the distances array object is kept
        self.dist_ /= correction
        return self.raw_covariance_ * correction

    def reweight_covariance(self, data):
        """Re-weight raw Minimum Covariance Determinant estimates.

        Re-weight observations using Rousseeuw's method (equivalent to
        deleting outlying observations from the data set before
        computing location and covariance estimates) described
        in [RVDriessen]_. The fitted `location_`, `support_`, `dist_` and
        covariance are updated with the re-weighted values.

        Parameters
        ----------
        data : array-like of shape (n_samples, n_features)
            The data matrix, with p features and n samples.
            The data set must be the one which was used to compute
            the raw estimates.

        Returns
        -------
        location_reweighted : ndarray of shape (n_features,)
            Re-weighted robust location estimate.

        covariance_reweighted : ndarray of shape (n_features, n_features)
            Re-weighted robust covariance estimate.

        support_reweighted : ndarray of shape (n_samples,), dtype=bool
            A mask of the observations that have been used to compute
            the re-weighted robust location and covariance estimates.

        References
        ----------

        .. [RVDriessen] A Fast Algorithm for the Minimum Covariance
            Determinant Estimator, 1999, American Statistical Association
            and the American Society for Quality, TECHNOMETRICS
        """
        n_samples, n_features = data.shape
        # observations whose distance is below the chi2 cut-off are kept
        cutoff = chi2(n_features).isf(0.025)
        mask = self.dist_ < cutoff
        kept = data[mask]

        location_reweighted = (
            np.zeros(n_features) if self.assume_centered else kept.mean(0)
        )
        covariance_reweighted = self._nonrobust_covariance(
            kept, assume_centered=self.assume_centered
        )
        support_reweighted = np.zeros(n_samples, dtype=bool)
        support_reweighted[mask] = True

        self._set_covariance(covariance_reweighted)
        self.location_ = location_reweighted
        self.support_ = support_reweighted

        # distances with respect to the re-weighted estimates
        centered = data - self.location_
        self.dist_ = (np.dot(centered, self.get_precision()) * centered).sum(axis=1)
        return location_reweighted, covariance_reweighted, support_reweighted
@@ -0,0 +1,816 @@
"""
Covariance estimators using shrinkage.
Shrinkage corresponds to regularising `cov` using a convex combination:
shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate.
"""
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
# avoid division truncation
import warnings
from numbers import Integral, Real
import numpy as np
from ..base import _fit_context
from ..utils import check_array
from ..utils._param_validation import Interval, validate_params
from . import EmpiricalCovariance, empirical_covariance
def _ledoit_wolf(X, *, assume_centered, block_size):
    """Estimate the shrunk Ledoit-Wolf covariance matrix.

    Returns the pair `(shrunk_cov, shrinkage)`. For a single-feature 2D
    input the (1, 1) variance and a shrinkage of 0.0 are returned directly.
    """
    # for only one feature, the result is the same whatever the shrinkage
    is_single_feature = len(X.shape) == 2 and X.shape[1] == 1
    if is_single_feature:
        centered = X if assume_centered else X - X.mean()
        return np.atleast_2d(np.mean(centered**2)), 0.0

    n_features = X.shape[1]

    # get Ledoit-Wolf shrinkage
    shrinkage = ledoit_wolf_shrinkage(
        X, assume_centered=assume_centered, block_size=block_size
    )
    emp_cov = empirical_covariance(X, assume_centered=assume_centered)

    # mean of the diagonal of the empirical covariance
    mu = np.trace(emp_cov) / n_features

    # (1 - shrinkage) * emp_cov + shrinkage * mu * identity, the identity
    # term being added through a strided view on the diagonal
    shrunk_cov = (1.0 - shrinkage) * emp_cov
    diagonal_step = n_features + 1
    shrunk_cov.flat[::diagonal_step] += shrinkage * mu
    return shrunk_cov, shrinkage
def _oas(X, *, assume_centered=False):
    """Estimate covariance with the Oracle Approximating Shrinkage algorithm.

    Returns the pair `(shrunk_cov, shrinkage)`.

    The formulation is based on [1]_.
    [1] "Shrinkage algorithms for MMSE covariance estimation.",
        Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
        IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
        https://arxiv.org/pdf/0907.4698.pdf
    """
    is_single_feature = len(X.shape) == 2 and X.shape[1] == 1
    if is_single_feature:
        # for only one feature, the result is the same whatever the shrinkage
        centered = X if assume_centered else X - X.mean()
        return np.atleast_2d(np.mean(centered**2)), 0.0

    n_samples, n_features = X.shape
    emp_cov = empirical_covariance(X, assume_centered=assume_centered)

    # The shrinkage is defined as:
    # shrinkage = min(
    #     (trace(S @ S.T) + trace(S)**2)
    #     / ((n + 1) * (trace(S @ S.T) - trace(S)**2 / p)),
    #     1,
    # )
    # where n and p are n_samples and n_features, respectively (cf. Eq. 23 in [1]).
    # The factor 2 / p is omitted since it does not impact the value of the estimator
    # for large p.
    #
    # Both traces are evaluated up to a common 1 / p**2 factor that cancels
    # in the ratio: the mean of the squared entries of S is
    # trace(S @ S.T) / p**2 (squared Frobenius norm over p**2, see
    # https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm), and mu**2 is
    # trace(S)**2 / p**2.
    mean_sq_entries = np.mean(emp_cov**2)
    mu = np.trace(emp_cov) / n_features
    mu_sq = mu**2

    numerator = mean_sq_entries + mu_sq
    denominator = (n_samples + 1) * (mean_sq_entries - mu_sq / n_features)
    if denominator == 0:
        shrinkage = 1.0
    else:
        shrinkage = min(numerator / denominator, 1.0)

    # The shrunk covariance is defined as:
    # (1 - shrinkage) * S + shrinkage * F (cf. Eq. 4 in [1])
    # where S is the empirical covariance and F is the shrinkage target defined as
    # F = trace(S) / n_features * np.identity(n_features) (cf. Eq. 3 in [1])
    shrunk_cov = (1.0 - shrinkage) * emp_cov
    diagonal_step = n_features + 1
    shrunk_cov.flat[::diagonal_step] += shrinkage * mu

    return shrunk_cov, shrinkage
###############################################################################
# Public API
# ShrunkCovariance estimator
@validate_params(
    {
        "emp_cov": ["array-like"],
        "shrinkage": [Interval(Real, 0, 1, closed="both")],
    },
    prefer_skip_nested_validation=True,
)
def shrunk_covariance(emp_cov, shrinkage=0.1):
    """Calculate covariance matrices shrunk on the diagonal.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    emp_cov : array-like of shape (..., n_features, n_features)
        Covariance matrices to be shrunk, at least 2D ndarray.

    shrinkage : float, default=0.1
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    Returns
    -------
    shrunk_cov : ndarray of shape (..., n_features, n_features)
        Shrunk covariance matrices.

    Notes
    -----
    The regularized (shrunk) covariance is given by::

        (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where `mu = trace(cov) / n_features`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import empirical_covariance, shrunk_covariance
    >>> real_cov = np.array([[.8, .3], [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
    >>> shrunk_covariance(empirical_covariance(X))
    array([[0.73..., 0.25...],
           [0.25..., 0.41...]])
    """
    emp_cov = check_array(emp_cov, allow_nd=True)
    n_features = emp_cov.shape[-1]

    # per-matrix mean of the diagonal, with trailing singleton axes appended
    # so that it broadcasts against the (n_features, n_features) identity
    diag_mean = np.trace(emp_cov, axis1=-2, axis2=-1) / n_features
    trailing_axes = tuple(range(diag_mean.ndim, emp_cov.ndim))
    diag_mean = np.expand_dims(diag_mean, axis=trailing_axes)

    shrunk_cov = (1.0 - shrinkage) * emp_cov
    shrunk_cov += shrinkage * diag_mean * np.eye(n_features)
    return shrunk_cov
class ShrunkCovariance(EmpiricalCovariance):
    """Covariance estimator with shrinkage.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False, data will be centered before computation.

    shrinkage : float, default=0.1
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    Attributes
    ----------
    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix

    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.

    Notes
    -----
    The regularized covariance is given by:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where mu = trace(cov) / n_features

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import ShrunkCovariance
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                                   cov=real_cov,
    ...                                   size=500)
    >>> cov = ShrunkCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7387..., 0.2536...],
           [0.2536..., 0.4110...]])
    >>> cov.location_
    array([0.0622..., 0.0193...])
    """

    _parameter_constraints: dict = {
        **EmpiricalCovariance._parameter_constraints,
        "shrinkage": [Interval(Real, 0, 1, closed="both")],
    }

    def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1):
        super().__init__(
            store_precision=store_precision, assume_centered=assume_centered
        )
        self.shrinkage = shrinkage

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the shrunk covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X)

        # Not calling the parent object to fit, to avoid a potential
        # matrix inversion when setting the precision
        self.location_ = (
            np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
        )

        # empirical covariance first, then shrink it towards its scaled identity
        emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)
        self._set_covariance(shrunk_covariance(emp_cov, self.shrinkage))

        return self
# Ledoit-Wolf estimator
@validate_params(
    {
        "X": ["array-like"],
        "assume_centered": ["boolean"],
        "block_size": [Interval(Integral, 1, None, closed="left")],
    },
    prefer_skip_nested_validation=True,
)
def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):
    """Estimate the Ledoit-Wolf shrinkage coefficient.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split.

    Returns
    -------
    shrinkage : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate.

    Notes
    -----
    The regularized (shrunk) covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where mu = trace(cov) / n_features

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import ledoit_wolf_shrinkage
    >>> real_cov = np.array([[.4, .2], [.2, .8]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
    >>> shrinkage_coefficient = ledoit_wolf_shrinkage(X)
    >>> shrinkage_coefficient
    0.23...
    """
    X = check_array(X)
    # for only one feature, the result is the same whatever the shrinkage
    if len(X.shape) == 2 and X.shape[1] == 1:
        return 0.0
    # NOTE(review): check_array presumably already rejects 1D input, which
    # would make this reshape unreachable - kept as is, confirm before removing
    if X.ndim == 1:
        X = np.reshape(X, (1, -1))

    if X.shape[0] == 1:
        warnings.warn(
            "Only one sample available. You may want to reshape your data array"
        )
    n_samples, n_features = X.shape

    # optionally center data
    if not assume_centered:
        X = X - X.mean(0)

    # A non-blocked version of the computation is present in the tests
    # in tests/test_covariance.py

    # number of full blocks to split the covariance matrix into; the
    # features left over after the last full block form the `tail`
    n_splits = n_features // block_size
    tail = slice(block_size * n_splits, None)
    X2 = X**2
    emp_cov_trace = np.sum(X2, axis=0) / n_samples
    mu = np.sum(emp_cov_trace) / n_features
    beta_ = 0.0  # sum of the coefficients of <X2.T, X2>
    delta_ = 0.0  # sum of the *squared* coefficients of <X.T, X>
    # starting block computation
    for i in range(n_splits):
        rows = slice(block_size * i, block_size * (i + 1))
        # full block x full block
        for j in range(n_splits):
            cols = slice(block_size * j, block_size * (j + 1))
            beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols]))
            delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2)
        # full block x tail
        beta_ += np.sum(np.dot(X2.T[rows], X2[:, tail]))
        delta_ += np.sum(np.dot(X.T[rows], X[:, tail]) ** 2)
    # tail x full block
    for j in range(n_splits):
        cols = slice(block_size * j, block_size * (j + 1))
        beta_ += np.sum(np.dot(X2.T[tail], X2[:, cols]))
        delta_ += np.sum(np.dot(X.T[tail], X[:, cols]) ** 2)
    # tail x tail
    delta_ += np.sum(np.dot(X.T[tail], X[:, tail]) ** 2)
    delta_ /= n_samples**2
    beta_ += np.sum(np.dot(X2.T[tail], X2[:, tail]))
    # use delta_ to compute beta
    beta = 1.0 / (n_features * n_samples) * (beta_ / n_samples - delta_)
    # delta is the sum of the squared coefficients of (<X.T,X> - mu*Id) / p
    delta = delta_ - 2.0 * mu * emp_cov_trace.sum() + n_features * mu**2
    delta /= n_features
    # get final beta as the min between beta and delta
    # We do this to prevent shrinking more than "1", which would invert
    # the value of covariances
    beta = min(beta, delta)
    # finally get shrinkage; a float is returned on every path, matching the
    # documented return type and the single-feature shortcut above
    shrinkage = 0.0 if beta == 0 else beta / delta
    return shrinkage
@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def ledoit_wolf(X, *, assume_centered=False, block_size=1000):
    """Compute the Ledoit-Wolf shrunk covariance matrix of a dataset.

    This is a functional shortcut: it fits a :class:`LedoitWolf` estimator
    (without storing the precision) and returns its covariance and shrinkage.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split.
        This is purely a memory optimization and does not affect results.

    Returns
    -------
    shrunk_cov : ndarray of shape (n_features, n_features)
        Shrunk covariance.

    shrinkage : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate.

    Notes
    -----
    The regularized (shrunk) covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where mu = trace(cov) / n_features

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import empirical_covariance, ledoit_wolf
    >>> real_cov = np.array([[.4, .2], [.2, .8]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)
    >>> covariance, shrinkage = ledoit_wolf(X)
    >>> covariance
    array([[0.44..., 0.16...],
           [0.16..., 0.80...]])
    >>> shrinkage
    0.23...
    """
    # The precision matrix is never returned, so do not ask for it.
    fitted = LedoitWolf(
        store_precision=False,
        assume_centered=assume_centered,
        block_size=block_size,
    )
    fitted.fit(X)
    return fitted.covariance_, fitted.shrinkage_
class LedoitWolf(EmpiricalCovariance):
    """Covariance estimator based on Ledoit-Wolf shrinkage.

    Ledoit-Wolf is a particular form of shrinkage, where the shrinkage
    coefficient is computed using O. Ledoit and M. Wolf's formula as
    described in "A Well-Conditioned Estimator for Large-Dimensional
    Covariance Matrices", Ledoit and Wolf, Journal of Multivariate
    Analysis, Volume 88, Issue 2, February 2004, pages 365-411.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split
        during its Ledoit-Wolf estimation. This is purely a memory
        optimization and does not affect results.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    shrinkage_ : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    The regularised covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)

    where mu = trace(cov) / n_features
    and shrinkage is given by the Ledoit and Wolf formula (see References)

    References
    ----------
    "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices",
    Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
    February 2004, pages 365-411.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import LedoitWolf
    >>> real_cov = np.array([[.4, .2],
    ...                      [.2, .8]])
    >>> np.random.seed(0)
    >>> X = np.random.multivariate_normal(mean=[0, 0],
    ...                                   cov=real_cov,
    ...                                   size=50)
    >>> cov = LedoitWolf().fit(X)
    >>> cov.covariance_
    array([[0.4406..., 0.1616...],
           [0.1616..., 0.8022...]])
    >>> cov.location_
    array([ 0.0595... , -0.0075...])
    """

    # Parent constraints plus the extra `block_size` hyper-parameter,
    # which must be an integer >= 1.
    _parameter_constraints: dict = {
        **EmpiricalCovariance._parameter_constraints,
        "block_size": [Interval(Integral, 1, None, closed="left")],
    }

    def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000):
        super().__init__(
            store_precision=store_precision,
            assume_centered=assume_centered,
        )
        self.block_size = block_size

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the Ledoit-Wolf shrunk covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # The parent's fit is bypassed on purpose: it would compute the
        # empirical covariance (and potentially the precision) needlessly.
        X = self._validate_data(X)
        self.location_ = np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
        # The data handed over is already shifted by `location_`, hence
        # `assume_centered=True` for the low-level routine.
        shrunk_cov, coef = _ledoit_wolf(
            X - self.location_,
            assume_centered=True,
            block_size=self.block_size,
        )
        self.shrinkage_ = coef
        self._set_covariance(shrunk_cov)
        return self
# OAS estimator
@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def oas(X, *, assume_centered=False):
    """Estimate covariance with the Oracle Approximating Shrinkage as proposed in [1]_.

    This is a functional shortcut: it fits an :class:`OAS` estimator and
    returns its covariance and shrinkage.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, data will be centered before computation.

    Returns
    -------
    shrunk_cov : array-like of shape (n_features, n_features)
        Shrunk covariance.

    shrinkage : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate.

    Notes
    -----
    The regularised covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features),

    where mu = trace(cov) / n_features and shrinkage is given by the OAS formula
    (see [1]_).

    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
    the original article, formula (23) states that 2/p (p being the number of
    features) is multiplied by Trace(cov*cov) in both the numerator and
    denominator, but this operation is omitted because for a large p, the value
    of 2/p is so small that it doesn't affect the value of the estimator.

    References
    ----------
    .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.",
           Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
           IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
           <0907.4698>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import oas
    >>> rng = np.random.RandomState(0)
    >>> real_cov = [[.8, .3], [.3, .4]]
    >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)
    >>> shrunk_cov, shrinkage = oas(X)
    >>> shrunk_cov
    array([[0.7533..., 0.2763...],
           [0.2763..., 0.3964...]])
    >>> shrinkage
    0.0195...
    """
    fitted = OAS(assume_centered=assume_centered)
    fitted.fit(X)
    return fitted.covariance_, fitted.shrinkage_
class OAS(EmpiricalCovariance):
    """Oracle Approximating Shrinkage Estimator as proposed in [1]_.

    Read more in the :ref:`User Guide <shrunk_covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    shrinkage_ : float
        Coefficient in the convex combination used for the computation
        of the shrunk estimate. Range is [0, 1].

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    EmpiricalCovariance : Maximum likelihood covariance estimator.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    GraphicalLassoCV : Sparse inverse covariance with cross-validated
        choice of the l1 penalty.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    ShrunkCovariance : Covariance estimator with shrinkage.

    Notes
    -----
    The regularised covariance is:

    (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features),

    where mu = trace(cov) / n_features and shrinkage is given by the OAS formula
    (see [1]_).

    The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In
    the original article, formula (23) states that 2/p (p being the number of
    features) is multiplied by Trace(cov*cov) in both the numerator and
    denominator, but this operation is omitted because for a large p, the value
    of 2/p is so small that it doesn't affect the value of the estimator.

    References
    ----------
    .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.",
           Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O.
           IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010.
           <0907.4698>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import OAS
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> oas = OAS().fit(X)
    >>> oas.covariance_
    array([[0.7533..., 0.2763...],
           [0.2763..., 0.3964...]])
    >>> oas.precision_
    array([[ 1.7833..., -1.2431... ],
           [-1.2431...,  3.3889...]])
    >>> oas.shrinkage_
    0.0195...
    """

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the Oracle Approximating Shrinkage covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X)
        # The parent's fit is bypassed on purpose: it would compute the
        # empirical covariance (and potentially the precision) needlessly.
        self.location_ = np.zeros(X.shape[1]) if self.assume_centered else X.mean(0)
        # The data handed over is already shifted by `location_`, hence
        # `assume_centered=True` for the low-level routine.
        shrunk_cov, coef = _oas(X - self.location_, assume_centered=True)
        self.shrinkage_ = coef
        self._set_covariance(shrunk_cov)
        return self
@@ -0,0 +1,377 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import numpy as np
import pytest
from sklearn import datasets
from sklearn.covariance import (
OAS,
EmpiricalCovariance,
LedoitWolf,
ShrunkCovariance,
empirical_covariance,
ledoit_wolf,
ledoit_wolf_shrinkage,
oas,
shrunk_covariance,
)
from sklearn.covariance._shrunk_covariance import _ledoit_wolf
from sklearn.utils._testing import (
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
from .._shrunk_covariance import _oas
X, _ = datasets.load_diabetes(return_X_y=True)
X_1d = X[:, 0]
n_samples, n_features = X.shape
def test_covariance():
    """Check `EmpiricalCovariance` against `empirical_covariance` on simple data."""
    # Estimator fitted on the data must agree with the function.
    estimator = EmpiricalCovariance()
    estimator.fit(X)
    reference = empirical_covariance(X)
    assert_array_almost_equal(reference, estimator.covariance_, 4)

    # The error w.r.t. its own covariance vanishes for every supported option.
    norm_options = (
        {},
        {"norm": "spectral"},
        {"norm": "frobenius"},
        {"scaling": False},
        {"squared": False},
    )
    for options in norm_options:
        assert_almost_equal(estimator.error_norm(reference, **options), 0)
    with pytest.raises(NotImplementedError):
        estimator.error_norm(reference, norm="foo")

    # Mahalanobis distances computation test
    distances = estimator.mahalanobis(X)
    assert np.amin(distances) > 0

    # Single-feature input.
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = EmpiricalCovariance()
    estimator.fit(single_feature)
    reference_1d = empirical_covariance(single_feature)
    assert_array_almost_equal(reference_1d, estimator.covariance_, 4)
    assert_almost_equal(estimator.error_norm(reference_1d), 0)
    assert_almost_equal(estimator.error_norm(reference_1d, norm="spectral"), 0)

    # A single sample (1 x 5) triggers a warning and a zero covariance.
    single_sample = np.arange(5).reshape(1, 5)
    estimator = EmpiricalCovariance()
    expected_warning = (
        "Only one sample available. You may want to reshape your data array"
    )
    with pytest.warns(UserWarning, match=expected_warning):
        estimator.fit(single_sample)
    assert_array_almost_equal(
        estimator.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)
    )

    # Integer input.
    int_data = np.asarray([[0, 1], [1, 0]])
    expected = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(int_data), expected)

    # With assume_centered=True the location is exactly zero.
    estimator = EmpiricalCovariance(assume_centered=True)
    estimator.fit(X)
    assert_array_equal(estimator.location_, np.zeros(X.shape[1]))
@pytest.mark.parametrize("n_matrices", [1, 3])
def test_shrunk_covariance_func(n_matrices):
    """Check `shrunk_covariance` on a single matrix and on a stack of matrices."""
    n_features = 2
    matrix = np.ones((n_features, n_features))
    expected = np.array([[1, 0.5], [0.5, 1]])

    if n_matrices > 1:
        # Stack identical copies along a new leading axis.
        matrix, expected = (
            np.repeat(arr[np.newaxis, ...], n_matrices, axis=0)
            for arr in (matrix, expected)
        )

    shrunk = shrunk_covariance(matrix, 0.5)
    assert_allclose(shrunk, expected)
def test_shrunk_covariance():
    """Check consistency between `ShrunkCovariance` and `shrunk_covariance`."""
    mle_cov = empirical_covariance(X)

    # Explicit shrinkage: estimator vs. function applied to the MLE estimate.
    estimator = ShrunkCovariance(shrinkage=0.5)
    estimator.fit(X)
    assert_array_almost_equal(
        shrunk_covariance(mle_cov, shrinkage=0.5), estimator.covariance_, 4
    )

    # Default shrinkage on both sides.
    estimator = ShrunkCovariance()
    estimator.fit(X)
    assert_array_almost_equal(shrunk_covariance(mle_cov), estimator.covariance_, 4)

    # shrinkage = 0 (<==> empirical_covariance)
    estimator = ShrunkCovariance(shrinkage=0.0)
    estimator.fit(X)
    assert_array_almost_equal(mle_cov, estimator.covariance_, 4)

    # Single-feature input.
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = ShrunkCovariance(shrinkage=0.3)
    estimator.fit(single_feature)
    assert_array_almost_equal(
        empirical_covariance(single_feature), estimator.covariance_, 4
    )

    # Without saving the precision.
    estimator = ShrunkCovariance(shrinkage=0.5, store_precision=False)
    estimator.fit(X)
    assert estimator.precision_ is None
def test_ledoit_wolf():
    """Check `LedoitWolf` against the function API and `ShrunkCovariance`."""
    warn_msg = "Only one sample available. You may want to reshape your data array"

    # ---- data assumed to be centered -------------------------------------
    X_centered = X - X.mean(axis=0)
    estimator = LedoitWolf(assume_centered=True)
    estimator.fit(X_centered)
    shrinkage_ = estimator.shrinkage_
    score_ = estimator.score(X_centered)
    # The shrinkage does not depend on the block size.
    assert_almost_equal(
        ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_
    )
    assert_almost_equal(
        ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6),
        shrinkage_,
    )

    # Function API agrees with the estimator.
    func_cov, func_shrinkage = ledoit_wolf(X_centered, assume_centered=True)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)

    # ShrunkCovariance with the same shrinkage gives the same covariance.
    shrunk = ShrunkCovariance(shrinkage=estimator.shrinkage_, assume_centered=True)
    shrunk.fit(X_centered)
    assert_array_almost_equal(shrunk.covariance_, estimator.covariance_, 4)

    # Single-feature input.
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = LedoitWolf(assume_centered=True)
    estimator.fit(single_feature)
    func_cov, func_shrinkage = ledoit_wolf(single_feature, assume_centered=True)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)
    assert_array_almost_equal(
        (single_feature**2).sum() / n_samples, estimator.covariance_, 4
    )

    # Without saving the precision.
    estimator = LedoitWolf(store_precision=False, assume_centered=True)
    estimator.fit(X_centered)
    assert_almost_equal(estimator.score(X_centered), score_, 4)
    assert estimator.precision_ is None

    # ---- same checks without assuming centered data ----------------------
    estimator = LedoitWolf()
    estimator.fit(X)
    assert_almost_equal(estimator.shrinkage_, shrinkage_, 4)
    assert_almost_equal(estimator.shrinkage_, ledoit_wolf_shrinkage(X))
    assert_almost_equal(estimator.shrinkage_, ledoit_wolf(X)[1])
    assert_almost_equal(
        estimator.shrinkage_,
        _ledoit_wolf(X=X, assume_centered=False, block_size=10000)[1],
    )
    assert_almost_equal(estimator.score(X), score_, 4)

    # Function API agrees with the estimator.
    func_cov, func_shrinkage = ledoit_wolf(X)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)

    # ShrunkCovariance with the same shrinkage gives the same covariance.
    shrunk = ShrunkCovariance(shrinkage=estimator.shrinkage_)
    shrunk.fit(X)
    assert_array_almost_equal(shrunk.covariance_, estimator.covariance_, 4)

    # Single-feature input.
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = LedoitWolf()
    estimator.fit(single_feature)
    assert_allclose(
        single_feature.var(ddof=0),
        _ledoit_wolf(X=single_feature, assume_centered=False, block_size=10000)[0],
    )
    func_cov, func_shrinkage = ledoit_wolf(single_feature)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)
    assert_array_almost_equal(
        empirical_covariance(single_feature), estimator.covariance_, 4
    )

    # A single sample must raise a warning and give a zero covariance.
    single_sample = np.arange(5).reshape(1, 5)
    estimator = LedoitWolf()
    with pytest.warns(UserWarning, match=warn_msg):
        estimator.fit(single_sample)
    assert_array_almost_equal(
        estimator.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)
    )

    # Without saving the precision.
    estimator = LedoitWolf(store_precision=False)
    estimator.fit(X)
    assert_almost_equal(estimator.score(X), score_, 4)
    assert estimator.precision_ is None
def _naive_ledoit_wolf_shrinkage(X):
    """Reference (non-blocked) Ledoit-Wolf shrinkage used to check the real one.

    A simple implementation of the formulas from "O. Ledoit and M. Wolf,
    A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices";
    beta and delta are given in the beginning of section 3.2.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input data. It does not need to be centered beforehand.

    Returns
    -------
    shrinkage : float
        Ratio ``beta / delta`` with ``beta`` capped at ``delta``; 0 when
        ``beta`` is exactly 0.
    """
    n_samples, n_features = X.shape
    emp_cov = empirical_covariance(X, assume_centered=False)
    mu = np.trace(emp_cov) / n_features
    # delta: mean squared coefficient of (emp_cov - mu * Id)
    delta_ = emp_cov.copy()
    delta_.flat[:: n_features + 1] -= mu
    delta = (delta_**2).sum() / n_features
    # `emp_cov` above is computed on centered data, so the squared samples
    # must be centered too; otherwise beta is only right for pre-centered X.
    X2 = (X - X.mean(axis=0)) ** 2
    beta_ = (
        1.0
        / (n_features * n_samples)
        * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov**2)
    )
    # Cap beta at delta so that the shrinkage never exceeds 1.
    beta = min(beta_, delta)
    # Same convention as `ledoit_wolf_shrinkage`: no division when beta is 0.
    shrinkage = 0 if beta == 0 else beta / delta
    return shrinkage
def test_ledoit_wolf_small():
    """Compare the blocked implementation with the naive one on 4 features."""
    first_four = X[:, :4]
    fitted = LedoitWolf().fit(first_four)
    assert_almost_equal(fitted.shrinkage_, _naive_ledoit_wolf_shrinkage(first_four))
def test_ledoit_wolf_large():
    """Check `LedoitWolf` on data with more features than `block_size`."""
    rng = np.random.RandomState(0)
    # 20 features: more than the block size of 10 used below.
    wide = rng.normal(size=(10, 20))

    blocked = LedoitWolf(block_size=10).fit(wide)
    # check that covariance is about diagonal (random normal noise)
    assert_almost_equal(blocked.covariance_, np.eye(20), 0)
    blocked_cov = blocked.covariance_

    # A block size above the number of features means no splitting at all;
    # the result must be the same.
    unblocked = LedoitWolf(block_size=25).fit(wide)
    assert_almost_equal(unblocked.covariance_, blocked_cov)
@pytest.mark.parametrize(
    "ledoit_wolf_fitting_function", [LedoitWolf().fit, ledoit_wolf_shrinkage]
)
def test_ledoit_wolf_empty_array(ledoit_wolf_fitting_function):
    """Check that a 0-sample array is rejected with a proper `ValueError`."""
    no_samples = np.zeros((0, 2))
    expected_error = "Found array with 0 sample"
    with pytest.raises(ValueError, match=expected_error):
        ledoit_wolf_fitting_function(no_samples)
def test_oas():
    """Check `OAS` against the function API and `ShrunkCovariance`."""
    warn_msg = "Only one sample available. You may want to reshape your data array"

    # ---- data assumed to be centered -------------------------------------
    X_centered = X - X.mean(axis=0)
    estimator = OAS(assume_centered=True)
    estimator.fit(X_centered)
    shrinkage_ = estimator.shrinkage_
    score_ = estimator.score(X_centered)

    # Function API agrees with the estimator.
    func_cov, func_shrinkage = oas(X_centered, assume_centered=True)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)

    # ShrunkCovariance with the same shrinkage gives the same covariance.
    shrunk = ShrunkCovariance(shrinkage=estimator.shrinkage_, assume_centered=True)
    shrunk.fit(X_centered)
    assert_array_almost_equal(shrunk.covariance_, estimator.covariance_, 4)

    # Single-feature input.
    single_feature = X[:, 0:1]
    estimator = OAS(assume_centered=True)
    estimator.fit(single_feature)
    func_cov, func_shrinkage = oas(single_feature, assume_centered=True)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)
    assert_array_almost_equal(
        (single_feature**2).sum() / n_samples, estimator.covariance_, 4
    )

    # Without saving the precision.
    estimator = OAS(store_precision=False, assume_centered=True)
    estimator.fit(X_centered)
    assert_almost_equal(estimator.score(X_centered), score_, 4)
    assert estimator.precision_ is None

    # ---- same checks without assuming centered data ----------------------
    estimator = OAS()
    estimator.fit(X)
    assert_almost_equal(estimator.shrinkage_, shrinkage_, 4)
    assert_almost_equal(estimator.score(X), score_, 4)

    # Function API agrees with the estimator.
    func_cov, func_shrinkage = oas(X)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)

    # ShrunkCovariance with the same shrinkage gives the same covariance.
    shrunk = ShrunkCovariance(shrinkage=estimator.shrinkage_)
    shrunk.fit(X)
    assert_array_almost_equal(shrunk.covariance_, estimator.covariance_, 4)

    # Single-feature input.
    single_feature = X[:, 0].reshape((-1, 1))
    estimator = OAS()
    estimator.fit(single_feature)
    func_cov, func_shrinkage = oas(single_feature)
    assert_array_almost_equal(func_cov, estimator.covariance_, 4)
    assert_almost_equal(func_shrinkage, estimator.shrinkage_)
    assert_array_almost_equal(
        empirical_covariance(single_feature), estimator.covariance_, 4
    )

    # A single sample must raise a warning and give a zero covariance.
    single_sample = np.arange(5).reshape(1, 5)
    estimator = OAS()
    with pytest.warns(UserWarning, match=warn_msg):
        estimator.fit(single_sample)
    assert_array_almost_equal(
        estimator.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)
    )

    # Without saving the precision.
    estimator = OAS(store_precision=False)
    estimator.fit(X)
    assert_almost_equal(estimator.score(X), score_, 4)
    assert estimator.precision_ is None

    # ---- private `_oas` called with its default centering ----------------
    first_feature = X[:, 0:1]
    estimator = OAS()
    estimator.fit(first_feature)
    private_cov, private_shrinkage = _oas(first_feature)
    assert_array_almost_equal(private_cov, estimator.covariance_, 4)
    assert_almost_equal(private_shrinkage, estimator.shrinkage_)
    assert_array_almost_equal(
        (first_feature**2).sum() / n_samples, estimator.covariance_, 4
    )
def test_EmpiricalCovariance_validates_mahalanobis():
    """Check that `mahalanobis` rejects data with a wrong number of features."""
    fitted = EmpiricalCovariance().fit(X)
    too_few_features = X[:, :2]

    msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input"
    with pytest.raises(ValueError, match=msg):
        fitted.mahalanobis(too_few_features)
@@ -0,0 +1,52 @@
"""
Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope).
"""
import numpy as np
import pytest
from sklearn.covariance import EllipticEnvelope
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
def test_elliptic_envelope(global_random_seed):
    """Check `EllipticEnvelope` before and after fitting on random data."""
    rng = np.random.RandomState(global_random_seed)
    data = rng.randn(100, 10)
    envelope = EllipticEnvelope(contamination=0.1)

    # Prediction-time methods must refuse to run on an unfitted estimator.
    for unfitted_call in (envelope.predict, envelope.decision_function):
        with pytest.raises(NotFittedError):
            unfitted_call(data)

    envelope.fit(data)
    labels = envelope.predict(data)
    scores = envelope.score_samples(data)
    decisions = envelope.decision_function(data)

    assert_array_almost_equal(scores, -envelope.mahalanobis(data))
    assert_array_almost_equal(envelope.mahalanobis(data), envelope.dist_)
    # Scoring against all-ones labels gives the fraction not predicted as -1.
    n_flagged = labels[labels == -1].size
    assert_almost_equal(envelope.score(data, np.ones(100)), (100 - n_flagged) / 100.0)
    assert sum(labels == -1) == sum(decisions < 0)
def test_score_samples():
    """Check that `score_samples` equals `decision_function + offset_`."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    query = [[2.0, 2.0]]
    with_contamination = EllipticEnvelope(contamination=0.2).fit(X_train)
    default = EllipticEnvelope().fit(X_train)

    for envelope in (with_contamination, default):
        assert_array_equal(
            envelope.score_samples(query),
            envelope.decision_function(query) + envelope.offset_,
        )
    # Both estimators must give the same raw scores.
    assert_array_equal(
        with_contamination.score_samples(query), default.score_samples(query)
    )
@@ -0,0 +1,318 @@
"""Test the graphical_lasso module."""
import sys
from io import StringIO
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import linalg
from sklearn import datasets
from sklearn.covariance import (
GraphicalLasso,
GraphicalLassoCV,
empirical_covariance,
graphical_lasso,
)
from sklearn.datasets import make_sparse_spd_matrix
from sklearn.model_selection import GroupKFold
from sklearn.utils import check_random_state
from sklearn.utils._testing import (
_convert_container,
assert_array_almost_equal,
assert_array_less,
)
def test_graphical_lassos(random_state=1):
    """Test the graphical lasso solvers.

    This check is unstable for some random seeds where the covariances found
    with the "cd" and "lars" solvers are different (4 cases / 100 tries).
    """
    # Sample data from a sparse multivariate normal
    dim, n_samples = 20, 100
    random_state = check_random_state(random_state)
    true_precision = make_sparse_spd_matrix(
        dim, alpha=0.95, random_state=random_state
    )
    true_cov = linalg.inv(true_precision)
    X = random_state.multivariate_normal(np.zeros(dim), true_cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (0.0, 0.1, 0.25):
        covs, icovs = {}, {}
        for method in ("cd", "lars"):
            covs[method], icovs[method], cost_pairs = graphical_lasso(
                emp_cov, return_costs=True, alpha=alpha, mode=method
            )
            costs, dual_gap = np.array(cost_pairs).T
            # Check that the costs always decrease (doesn't hold if alpha == 0)
            if alpha != 0:
                # use 1e-12 since the cost can be exactly 0
                assert_array_less(np.diff(costs), 1e-12)
        # Check that the 2 approaches give similar results
        assert_allclose(covs["cd"], covs["lars"], atol=5e-4)
        assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4)

    # Smoke test the estimator; `covs` still holds the alpha=0.25 results
    # from the last loop iteration.
    model = GraphicalLasso(alpha=0.25).fit(X)
    model.score(X)
    for method in ("cd", "lars"):
        assert_array_almost_equal(model.covariance_, covs[method], decimal=4)

    # For a centered matrix, assume_centered could be chosen True or False
    # Check that this returns indeed the same result for centered data
    Z = X - X.mean(0)
    precs = [
        GraphicalLasso(assume_centered=flag).fit(Z).precision_
        for flag in (False, True)
    ]
    assert_array_almost_equal(precs[0], precs[1])
def test_graphical_lasso_when_alpha_equals_0():
    """Test graphical_lasso's early return condition when alpha=0.

    With alpha=0 the precision must be the plain inverse of the empirical
    covariance, both through the estimator (precomputed covariance) and
    through the function.
    """
    # Seeded local generator: the global `np.random` state would make this
    # test depend on whatever ran before it.
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    emp_cov = empirical_covariance(X, assume_centered=True)
    expected_precision = np.linalg.inv(emp_cov)

    model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov)
    assert_allclose(model.precision_, expected_precision)

    _, precision = graphical_lasso(emp_cov, alpha=0)
    assert_allclose(precision, expected_precision)
@pytest.mark.parametrize("mode", ["cd", "lars"])
def test_graphical_lasso_n_iter(mode):
    """Check that `graphical_lasso` reports `max_iter` iterations when capped at 2."""
    data, _ = datasets.make_classification(
        n_samples=5_000, n_features=20, random_state=0
    )
    emp_cov = empirical_covariance(data)

    result = graphical_lasso(emp_cov, 0.2, mode=mode, max_iter=2, return_n_iter=True)
    # The result is (covariance, precision, n_iter); only the last one matters.
    _, _, n_iter = result
    assert n_iter == 2
def test_graphical_lasso_iris():
    """Compare `graphical_lasso` on iris with hard-coded R glasso results."""
    # Hard-coded solution from R glasso package for alpha=1.0
    # (need to set penalize.diagonal to FALSE)
    expected_cov = np.array(
        [
            [0.68112222, 0.0000000, 0.265820, 0.02464314],
            [0.00000000, 0.1887129, 0.000000, 0.00000000],
            [0.26582000, 0.0000000, 3.095503, 0.28697200],
            [0.02464314, 0.0000000, 0.286972, 0.57713289],
        ]
    )
    expected_icov = np.array(
        [
            [1.5190747, 0.000000, -0.1304475, 0.0000000],
            [0.0000000, 5.299055, 0.0000000, 0.0000000],
            [-0.1304475, 0.000000, 0.3498624, -0.1683946],
            [0.0000000, 0.000000, -0.1683946, 1.8164353],
        ]
    )
    iris_data = datasets.load_iris().data
    emp_cov = empirical_covariance(iris_data)
    # Both solvers must reproduce the reference solution.
    for solver in ("cd", "lars"):
        cov, icov = graphical_lasso(
            emp_cov, alpha=1.0, return_costs=False, mode=solver
        )
        assert_array_almost_equal(cov, expected_cov)
        assert_array_almost_equal(icov, expected_icov)
def test_graph_lasso_2D():
    """Compare `graphical_lasso` on two iris features with skggm results."""
    # Hard-coded solution from Python skggm package
    # obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)`
    expected_cov = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]])
    expected_icov = np.array([[1.52836773, -3.14334831], [-3.14334831, 8.19753385]])

    # Keep only the columns from index 2 onwards.
    last_columns = datasets.load_iris().data[:, 2:]
    emp_cov = empirical_covariance(last_columns)
    for solver in ("cd", "lars"):
        cov, icov = graphical_lasso(
            emp_cov, alpha=0.1, return_costs=False, mode=solver
        )
        assert_array_almost_equal(cov, expected_cov)
        assert_array_almost_equal(icov, expected_icov)
def test_graphical_lasso_iris_singular():
    """Compare `graphical_lasso` with R glasso in the rank-deficient case."""
    # Small subset of rows to test the rank-deficient case
    # Need to choose samples such that none of the variances are zero
    row_ids = np.arange(10, 13)

    # Hard-coded solution from R glasso package for alpha=0.01
    expected_cov = np.array(
        [
            [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
            [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
            [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
            [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222],
        ]
    )
    expected_icov = np.array(
        [
            [24.42244057, -16.831679593, 0.0, 0.0],
            [-16.83168201, 24.351841681, -6.206896552, -12.5],
            [0.0, -6.206896171, 153.103448276, 0.0],
            [0.0, -12.499999143, 0.0, 462.5],
        ]
    )
    few_rows = datasets.load_iris().data[row_ids, :]
    emp_cov = empirical_covariance(few_rows)
    for solver in ("cd", "lars"):
        cov, icov = graphical_lasso(
            emp_cov, alpha=0.01, return_costs=False, mode=solver
        )
        assert_array_almost_equal(cov, expected_cov, decimal=5)
        assert_array_almost_equal(icov, expected_icov, decimal=5)
def test_graphical_lasso_cv(random_state=1):
    """Smoke test `GraphicalLassoCV` in verbose mode on a tiny dataset."""
    # Sample data from a sparse multivariate normal
    dim, n_samples = 5, 6
    random_state = check_random_state(random_state)
    true_precision = make_sparse_spd_matrix(
        dim, alpha=0.96, random_state=random_state
    )
    true_cov = linalg.inv(true_precision)
    X = random_state.multivariate_normal(np.zeros(dim), true_cov, size=n_samples)

    # Swap stdout for an in-memory buffer while fitting, and always restore it.
    saved_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        # We need verbose very high so that Parallel prints on stdout
        GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X)
    finally:
        sys.stdout = saved_stdout
@pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"])
def test_graphical_lasso_cv_alphas_iterable(alphas_container_type):
    """Fitting must succeed when `alphas` is supplied as an array-like.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/22489
    """
    # Ground-truth covariance used to draw the training samples.
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)

    # Wrap the same two alpha values in the container type under test.
    alpha_grid = _convert_container([0.02, 0.03], alphas_container_type)
    estimator = GraphicalLassoCV(alphas=alpha_grid, tol=1e-1, n_jobs=1)
    estimator.fit(X)
@pytest.mark.parametrize(
    "alphas,err_type,err_msg",
    [
        ([-0.02, 0.03], ValueError, "must be > 0"),
        ([0, 0.03], ValueError, "must be > 0"),
        (["not_number", 0.03], TypeError, "must be an instance of float"),
    ],
)
def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg):
    """Invalid entries in an array-like `alphas` must be rejected at fit time.

    A non-positive value is expected to raise a ValueError and a string
    entry a TypeError, each with the parametrized message.
    """
    # Ground-truth covariance used to draw the training samples.
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)

    estimator = GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1)
    with pytest.raises(err_type, match=err_msg):
        estimator.fit(X)
def test_graphical_lasso_cv_scores():
    """Check the layout of `cv_results_` after fitting with an integer `cv`."""
    splits, n_alphas, n_refinements = 4, 5, 3

    # Ground-truth covariance used to draw the training samples.
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200)

    model = GraphicalLassoCV(
        cv=splits, alphas=n_alphas, n_refinements=n_refinements
    )
    fitted = model.fit(X)

    _assert_graphical_lasso_cv_scores(
        cov=fitted,
        n_splits=splits,
        n_refinements=n_refinements,
        n_alphas=n_alphas,
    )
@pytest.mark.usefixtures("enable_slep006")
def test_graphical_lasso_cv_scores_with_routing(global_random_seed):
    """Check that `GraphicalLassoCV` forwards `groups` to its CV splitter.

    A `GroupKFold` splitter that requests `groups` is used, and `groups` is
    passed through `fit`; the resulting `cv_results_` layout is then checked.
    """
    splits, n_alphas, n_refinements = 5, 5, 3

    # Ground-truth covariance used to draw the training samples.
    true_cov = np.array(
        [
            [0.8, 0.0, 0.2, 0.0],
            [0.0, 0.4, 0.0, 0.0],
            [0.2, 0.0, 0.3, 0.1],
            [0.0, 0.0, 0.1, 0.7],
        ]
    )
    rng = np.random.RandomState(global_random_seed)
    X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=300)
    # One group label in {0, ..., 4} per sample.
    groups = rng.randint(0, 5, X.shape[0])

    splitter = GroupKFold(n_splits=splits)
    splitter.set_split_request(groups=True)

    model = GraphicalLassoCV(
        cv=splitter, alphas=n_alphas, n_refinements=n_refinements
    )
    fitted = model.fit(X, groups=groups)

    _assert_graphical_lasso_cv_scores(
        cov=fitted,
        n_splits=splits,
        n_refinements=n_refinements,
        n_alphas=n_alphas,
    )
def _assert_graphical_lasso_cv_scores(cov, n_splits, n_refinements, n_alphas):
cv_results = cov.cv_results_
# alpha and one for each split
total_alphas = n_refinements * n_alphas + 1
keys = ["alphas"]
split_keys = [f"split{i}_test_score" for i in range(n_splits)]
for key in keys + split_keys:
assert key in cv_results
assert len(cv_results[key]) == total_alphas
cv_scores = np.asarray([cov.cv_results_[key] for key in split_keys])
expected_mean = cv_scores.mean(axis=0)
expected_std = cv_scores.std(axis=0)
assert_allclose(cov.cv_results_["mean_test_score"], expected_mean)
assert_allclose(cov.cv_results_["std_test_score"], expected_std)
@@ -0,0 +1,171 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import itertools
import numpy as np
import pytest
from sklearn import datasets
from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd
from sklearn.utils._testing import assert_array_almost_equal
# Module-level data derived from the iris dataset.
# NOTE(review): the test functions visible in this part of the file build
# their own local data; confirm these globals are still used elsewhere.
X = datasets.load_iris().data
# First feature column of X, as a 1D array.
X_1d = X[:, 0]
# Number of rows and columns of X.
n_samples, n_features = X.shape
def test_mcd(global_random_seed):
    """Exercise the FastMCD implementation on synthetic datasets.

    Each scenario is forwarded, in order, to `launch_mcd_on_dataset` together
    with `global_random_seed`.
    """
    # Columns: n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support
    scenarios = [
        # Small data set without outliers (random independent normal data)
        (100, 5, 0, 0.02, 0.1, 75),
        # Small data set, medium contamination
        (100, 5, 20, 0.3, 0.3, 65),
        # Small data set, strong contamination
        (100, 5, 40, 0.1, 0.1, 50),
        # Medium data set
        (1000, 5, 450, 0.1, 0.1, 540),
        # Large data set
        (1700, 5, 800, 0.1, 0.1, 870),
        # 1D data set
        (500, 1, 100, 0.02, 0.02, 350),
    ]
    for scenario in scenarios:
        launch_mcd_on_dataset(*scenario, global_random_seed)
def test_fast_mcd_on_invalid_input():
    """`fast_mcd` must reject 1D input with an explicit ValueError."""
    one_dim_input = np.arange(100)
    expected_error = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=expected_error):
        fast_mcd(one_dim_input)
def test_mcd_class_on_invalid_input():
    """`MinCovDet.fit` must reject 1D input with an explicit ValueError."""
    one_dim_input = np.arange(100)
    estimator = MinCovDet()
    expected_error = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=expected_error):
        estimator.fit(one_dim_input)
def launch_mcd_on_dataset(
    n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed
):
    """Fit `MinCovDet` on contaminated Gaussian data and check its estimates.

    Parameters
    ----------
    n_samples, n_features : int
        Shape of the standard-normal data that is generated.
    n_outliers : int
        Number of rows shifted away from the bulk of the data.
    tol_loc, tol_cov : float
        Strict upper bounds on the mean squared error of the location and
        covariance estimates, measured against the inlier-only statistics.
    tol_support : int
        Minimum number of samples that must be flagged in `support_`.
    seed : int
        Seeds both the data generator and the estimator.
    """
    rng = np.random.RandomState(seed)
    data = rng.randn(n_samples, n_features)

    # Contaminate a random subset of rows: every feature is shifted by
    # either -5 or +5.
    contaminated_rows = rng.permutation(n_samples)[:n_outliers]
    shifts = 10.0 * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[contaminated_rows] += shifts

    is_inlier = np.ones(n_samples, dtype=bool)
    is_inlier[contaminated_rows] = False
    clean_data = data[is_inlier]

    # Compute MCD by fitting an estimator object on the full data.
    estimator = MinCovDet(random_state=seed).fit(data)

    # Compare with the estimates learnt from the inliers only.
    location_mse = np.mean((clean_data.mean(0) - estimator.location_) ** 2)
    assert location_mse < tol_loc

    covariance_gap = empirical_covariance(clean_data) - estimator.covariance_
    covariance_mse = np.mean(covariance_gap**2)
    assert covariance_mse < tol_cov

    assert np.sum(estimator.support_) >= tol_support

    # Distances recomputed on the training data must match the stored ones.
    assert_array_almost_equal(estimator.mahalanobis(data), estimator.dist_)
def test_mcd_issue1127():
    """Fitting must not break on data of shape (3, 1).

    This is the case where n_support equals n_samples.
    """
    rng = np.random.RandomState(0)
    tiny_sample = rng.normal(size=(3, 1))
    estimator = MinCovDet()
    estimator.fit(tiny_sample)
def test_mcd_issue3367(global_random_seed):
    """Check that MCD completes when the covariance matrix is singular.

    The data lie in a plane (third coordinate identically zero), so one row
    and one column of the covariance matrix are all zeros.
    """
    rng = np.random.RandomState(global_random_seed)

    # 10 evenly spaced coordinate values between -5 and 5, used for X and Y.
    axis_values = np.linspace(-5, 5, 10).tolist()

    # Cartesian product of the values with themselves: every (x, y) pair.
    xy_points = np.array(list(itertools.product(axis_values, repeat=2)))

    # Append an all-zero third column so that the points sit in the XY plane.
    zero_column = np.zeros((xy_points.shape[0], 1))
    planar_points = np.hstack((xy_points, zero_column))

    # The fit below must run to completion despite the singular covariance.
    #
    # A stronger check (not implemented here) could use the geometry of the
    # points: the principal components (eigenvectors) of the fitted
    # covariance relate directly to the plane, so the eigenvector of the
    # smallest eigenvalue should be the plane normal, [0, 0, 1]:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # and then compare `normal` with [0, 0, 1] up to a small floating point
    # tolerance (e.g. 1e-12).
    MinCovDet(random_state=rng).fit(planar_points)
def test_mcd_support_covariance_is_zero():
    """MCD must raise an informative ValueError on zero support covariance.

    Both single-feature datasets consist mostly of one repeated value.
    """
    mostly_point_one = np.array(
        [0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1]
    ).reshape(-1, 1)
    mostly_point_three = np.array(
        [0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3]
    ).reshape(-1, 1)

    expected_error = (
        "The covariance matrix of the support data is equal to 0, try to "
        "increase support_fraction"
    )
    for samples in (mostly_point_one, mostly_point_three):
        estimator = MinCovDet()
        with pytest.raises(ValueError, match=expected_error):
            estimator.fit(samples)
def test_mcd_increasing_det_warning(global_random_seed):
    """A RuntimeWarning must be raised when determinants increase.

    In theory the sequence of determinants observed during the c_step should
    be decreasing. Increasing determinants are likely due to ill-conditioned
    covariance matrices that result in poor precision matrices.
    """
    samples = [
        [5.1, 3.5, 1.4, 0.2],
        [4.9, 3.0, 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5.0, 3.6, 1.4, 0.2],
        [4.6, 3.4, 1.4, 0.3],
        [5.0, 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3.0, 1.4, 0.1],
        [4.3, 3.0, 1.1, 0.1],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [4.6, 3.6, 1.0, 0.2],
        [5.0, 3.0, 1.6, 0.2],
        [5.2, 3.5, 1.5, 0.2],
    ]

    estimator = MinCovDet(support_fraction=0.5, random_state=global_random_seed)
    with pytest.warns(RuntimeWarning, match="Determinant has increased"):
        estimator.fit(samples)

Some files were not shown because too many files have changed in this diff Show More