feat: initial commit - Phase 1 & 2 core features

hiderfong
2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,122 @@
"""
The :mod:`sklearn.utils` module includes various utilities.
"""
import warnings
from collections.abc import Sequence
import numpy as np
from ..exceptions import DataConversionWarning
from . import _joblib, metadata_routing
from ._bunch import Bunch
from ._chunking import gen_batches, gen_even_slices
from ._estimator_html_repr import estimator_html_repr
# Make _safe_indexing importable from here for backward compat as this particular
# helper is considered semi-private and typically very useful for third-party
# libraries that want to comply with scikit-learn's estimator API. In particular,
# _safe_indexing was included in our public API documentation despite the leading
# `_` in its name.
from ._indexing import (
_safe_indexing, # noqa
resample,
shuffle,
)
from ._mask import safe_mask
from .class_weight import compute_class_weight, compute_sample_weight
from .deprecation import deprecated
from .discovery import all_estimators
from .extmath import safe_sqr
from .murmurhash import murmurhash3_32
from .validation import (
as_float_array,
assert_all_finite,
check_array,
check_consistent_length,
check_random_state,
check_scalar,
check_symmetric,
check_X_y,
column_or_1d,
indexable,
)
# TODO(1.7): remove parallel_backend and register_parallel_backend
msg = "deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead."
register_parallel_backend = deprecated(msg)(_joblib.register_parallel_backend)
# `deprecated` applied to a class modifies it in place, which would alter the object in the _joblib module, so we subclass it instead
@deprecated(msg)
class parallel_backend(_joblib.parallel_backend):
pass
__all__ = [
"murmurhash3_32",
"as_float_array",
"assert_all_finite",
"check_array",
"check_random_state",
"compute_class_weight",
"compute_sample_weight",
"column_or_1d",
"check_consistent_length",
"check_X_y",
"check_scalar",
"indexable",
"check_symmetric",
"deprecated",
"parallel_backend",
"register_parallel_backend",
"resample",
"shuffle",
"all_estimators",
"DataConversionWarning",
"estimator_html_repr",
"Bunch",
"metadata_routing",
"safe_sqr",
"safe_mask",
"gen_batches",
"gen_even_slices",
]
# TODO(1.7): remove
def __getattr__(name):
if name == "IS_PYPY":
warnings.warn(
"IS_PYPY is deprecated and will be removed in 1.7.",
FutureWarning,
)
from .fixes import _IS_PYPY
return _IS_PYPY
raise AttributeError(f"module {__name__} has no attribute {name}")
# TODO(1.7): remove tosequence
@deprecated("tosequence was deprecated in 1.5 and will be removed in 1.7")
def tosequence(x):
"""Cast iterable x to a Sequence, avoiding a copy if possible.
Parameters
----------
x : iterable
The iterable to be converted.
Returns
-------
x : Sequence
If `x` is a NumPy array, it is returned as an `ndarray`. If `x`
is a `Sequence`, `x` is returned as-is. Otherwise, `x` is returned
cast as a list.
"""
if isinstance(x, np.ndarray):
return np.asarray(x)
elif isinstance(x, Sequence):
return x
else:
return list(x)
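# Hedged usage sketch for the (deprecated) helper above; kept as comments so that
# importing the module has no side effects:
#
#   >>> import numpy as np
#   >>> tosequence(np.array([1, 2]))   # ndarray is passed through np.asarray
#   array([1, 2])
#   >>> tosequence((1, 2))             # tuples are Sequences, returned as-is
#   (1, 2)
#   >>> tosequence(iter([1, 2]))       # any other iterable is cast to a list
#   [1, 2]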
@@ -0,0 +1,30 @@
from .validation import check_random_state
def _init_arpack_v0(size, random_state):
"""Initialize the starting vector for iteration in ARPACK functions.
Initialize a ndarray with values sampled from the uniform distribution on
[-1, 1]. This initialization model has been chosen to be consistent with
the ARPACK one as another initialization can lead to convergence issues.
Parameters
----------
size : int
The size of the eigenvalue vector to be initialized.
random_state : int, RandomState instance or None, default=None
The seed of the pseudo random number generator used to generate a
uniform distribution. If int, random_state is the seed used by the
random number generator; If RandomState instance, random_state is the
random number generator; If None, the random number generator is the
RandomState instance used by `np.random`.
Returns
-------
v0 : ndarray of shape (size,)
The initialized vector.
"""
random_state = check_random_state(random_state)
v0 = random_state.uniform(-1, 1, size)
return v0
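# Hedged usage sketch (kept as comments): the start vector is deterministic for a
# fixed seed, has the requested size and stays within [-1, 1].
#
#   >>> v0 = _init_arpack_v0(10, random_state=0)
#   >>> v0.shape
#   (10,)
#   >>> bool((v0 >= -1).all() and (v0 <= 1).all())
#   True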
@@ -0,0 +1,838 @@
"""Tools to support array_api."""
import itertools
import math
from functools import wraps
import numpy
import scipy.special as special
from .._config import get_config
from .fixes import parse_version
_NUMPY_NAMESPACE_NAMES = {"numpy", "array_api_compat.numpy"}
def yield_namespaces(include_numpy_namespaces=True):
"""Yield supported namespace.
This is meant to be used for testing purposes only.
Parameters
----------
include_numpy_namespaces : bool, default=True
If True, also yield numpy namespaces.
Returns
-------
array_namespace : str
The name of the Array API namespace.
"""
for array_namespace in [
# The following is used to test the array_api_compat wrapper when
# array_api_dispatch is enabled: in particular, the arrays used in the
# tests are regular numpy arrays without any "device" attribute.
"numpy",
# Stricter NumPy-based Array API implementation. The
# array_api_strict.Array instances always have a dummy "device" attribute.
"array_api_strict",
"cupy",
"cupy.array_api",
"torch",
]:
if not include_numpy_namespaces and array_namespace in _NUMPY_NAMESPACE_NAMES:
continue
yield array_namespace
def yield_namespace_device_dtype_combinations(include_numpy_namespaces=True):
"""Yield supported namespace, device, dtype tuples for testing.
Use this to test that an estimator works with all combinations.
Parameters
----------
include_numpy_namespaces : bool, default=True
If True, also yield numpy namespaces.
Returns
-------
array_namespace : str
The name of the Array API namespace.
device : str
The name of the device on which to allocate the arrays. Can be None to
indicate that the default value should be used.
dtype_name : str
The name of the data type to use for arrays. Can be None to indicate
that the default value should be used.
"""
for array_namespace in yield_namespaces(
include_numpy_namespaces=include_numpy_namespaces
):
if array_namespace == "torch":
for device, dtype in itertools.product(
("cpu", "cuda"), ("float64", "float32")
):
yield array_namespace, device, dtype
yield array_namespace, "mps", "float32"
else:
yield array_namespace, None, None
def _check_array_api_dispatch(array_api_dispatch):
"""Check that array_api_compat is installed and NumPy version is compatible.
array_api_compat follows NEP29, which has a higher minimum NumPy version than
scikit-learn.
"""
if array_api_dispatch:
try:
import array_api_compat # noqa
except ImportError:
raise ImportError(
"array_api_compat is required to dispatch arrays using the API"
" specification"
)
numpy_version = parse_version(numpy.__version__)
min_numpy_version = "1.21"
if numpy_version < parse_version(min_numpy_version):
raise ImportError(
f"NumPy must be {min_numpy_version} or newer to dispatch array using"
" the API specification"
)
def _single_array_device(array):
"""Hardware device where the array data resides on."""
if isinstance(array, (numpy.ndarray, numpy.generic)) or not hasattr(
array, "device"
):
return "cpu"
else:
return array.device
def device(*array_list, remove_none=True, remove_types=(str,)):
"""Hardware device where the array data resides on.
If the hardware device is not the same for all arrays, an error is raised.
Parameters
----------
*array_list : arrays
List of array instances from NumPy or an array API compatible library.
remove_none : bool, default=True
Whether to ignore None objects passed in array_list.
remove_types : tuple or list, default=(str,)
Types to ignore in array_list.
Returns
-------
out : device
`device` object (see the "Device Support" section of the array API spec).
"""
array_list = _remove_non_arrays(
*array_list, remove_none=remove_none, remove_types=remove_types
)
# Note that _remove_non_arrays ensures that array_list is not empty.
device_ = _single_array_device(array_list[0])
# Note: here we cannot simply use a Python `set` as it requires
# hashable members which is not guaranteed for Array API device
# objects. In particular, CuPy devices are not hashable at the
# time of writing.
for array in array_list[1:]:
device_other = _single_array_device(array)
if device_ != device_other:
raise ValueError(
f"Input arrays use different devices: {str(device_)}, "
f"{str(device_other)}"
)
return device_
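# Hedged usage sketch (kept as comments): plain NumPy arrays expose no `device`
# attribute, so they are all reported as the conventional "cpu" device.
#
#   >>> import numpy as np
#   >>> device(np.asarray([1.0]), np.asarray([2.0]))
#   'cpu'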
def size(x):
"""Return the total number of elements of x.
Parameters
----------
x : array
Array instance from NumPy or an array API compatible library.
Returns
-------
out : int
Total number of elements.
"""
return math.prod(x.shape)
def _is_numpy_namespace(xp):
"""Return True if xp is backed by NumPy."""
return xp.__name__ in _NUMPY_NAMESPACE_NAMES
def _union1d(a, b, xp):
if _is_numpy_namespace(xp):
return xp.asarray(numpy.union1d(a, b))
assert a.ndim == b.ndim == 1
return xp.unique_values(xp.concat([xp.unique_values(a), xp.unique_values(b)]))
def isdtype(dtype, kind, *, xp):
"""Returns a boolean indicating whether a provided dtype is of type "kind".
Included in version 2022.12 of the Array API spec.
https://data-apis.org/array-api/latest/API_specification/generated/array_api.isdtype.html
"""
if isinstance(kind, tuple):
return any(_isdtype_single(dtype, k, xp=xp) for k in kind)
else:
return _isdtype_single(dtype, kind, xp=xp)
def _isdtype_single(dtype, kind, *, xp):
if isinstance(kind, str):
if kind == "bool":
return dtype == xp.bool
elif kind == "signed integer":
return dtype in {xp.int8, xp.int16, xp.int32, xp.int64}
elif kind == "unsigned integer":
return dtype in {xp.uint8, xp.uint16, xp.uint32, xp.uint64}
elif kind == "integral":
return any(
_isdtype_single(dtype, k, xp=xp)
for k in ("signed integer", "unsigned integer")
)
elif kind == "real floating":
return dtype in supported_float_dtypes(xp)
elif kind == "complex floating":
# Some namespaces do not have complex dtypes, such as cupy.array_api
complex_dtypes = set()
if hasattr(xp, "complex64"):
complex_dtypes.add(xp.complex64)
if hasattr(xp, "complex128"):
complex_dtypes.add(xp.complex128)
return dtype in complex_dtypes
elif kind == "numeric":
return any(
_isdtype_single(dtype, k, xp=xp)
for k in ("integral", "real floating", "complex floating")
)
else:
raise ValueError(f"Unrecognized data type kind: {kind!r}")
else:
return dtype == kind
def supported_float_dtypes(xp):
"""Supported floating point types for the namespace.
Note: float16 is not officially part of the Array API spec at the
time of writing but scikit-learn estimators and functions can choose
to accept it when xp.float16 is defined.
https://data-apis.org/array-api/latest/API_specification/data_types.html
"""
if hasattr(xp, "float16"):
return (xp.float64, xp.float32, xp.float16)
else:
return (xp.float64, xp.float32)
def ensure_common_namespace_device(reference, *arrays):
"""Ensure that all arrays use the same namespace and device as reference.
If necessary, the arrays are moved to the same namespace and device as
the reference array.
Parameters
----------
reference : array
Reference array.
*arrays : array
Arrays to check.
Returns
-------
arrays : list
Arrays with the same namespace and device as reference.
"""
xp, is_array_api = get_namespace(reference)
if is_array_api:
device_ = device(reference)
# Move arrays to the same namespace and device as the reference array.
return [xp.asarray(a, device=device_) for a in arrays]
else:
return arrays
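# Hedged usage sketch (kept as comments): with array API dispatch disabled (the
# default), `get_namespace` reports is_array_api=False and the inputs are passed
# through unchanged.
#
#   >>> import numpy as np
#   >>> ensure_common_namespace_device(np.zeros(3), [1.0, 2.0, 3.0])
#   ([1.0, 2.0, 3.0],)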
class _ArrayAPIWrapper:
"""sklearn specific Array API compatibility wrapper
This wrapper makes it possible for scikit-learn maintainers to
deal with discrepancies between different implementations of the
Python Array API standard and its evolution over time.
The Python Array API standard specification:
https://data-apis.org/array-api/latest/
Documentation of the NumPy implementation:
https://numpy.org/neps/nep-0047-array-api-standard.html
"""
def __init__(self, array_namespace):
self._namespace = array_namespace
def __getattr__(self, name):
return getattr(self._namespace, name)
def __eq__(self, other):
return self._namespace == other._namespace
def isdtype(self, dtype, kind):
return isdtype(dtype, kind, xp=self._namespace)
def _check_device_cpu(device): # noqa
if device not in {"cpu", None}:
raise ValueError(f"Unsupported device for NumPy: {device!r}")
def _accept_device_cpu(func):
@wraps(func)
def wrapped_func(*args, **kwargs):
_check_device_cpu(kwargs.pop("device", None))
return func(*args, **kwargs)
return wrapped_func
class _NumPyAPIWrapper:
"""Array API compat wrapper for any numpy version
NumPy < 2 does not implement the namespace. NumPy 2 and later should
progressively implement more and more of the latest Array API spec but this
is still work in progress at this time.
This wrapper makes it possible to write code that uses the standard Array
API while working with any version of NumPy supported by scikit-learn.
See the `get_namespace()` public function for more details.
"""
# TODO: once scikit-learn drops support for NumPy < 2, this class can be
# removed, assuming Array API compliance of NumPy 2 is actually sufficient
# for scikit-learn's needs.
# Creation functions in spec:
# https://data-apis.org/array-api/latest/API_specification/creation_functions.html
_CREATION_FUNCS = {
"arange",
"empty",
"empty_like",
"eye",
"full",
"full_like",
"linspace",
"ones",
"ones_like",
"zeros",
"zeros_like",
}
# Data types in spec
# https://data-apis.org/array-api/latest/API_specification/data_types.html
_DTYPES = {
"int8",
"int16",
"int32",
"int64",
"uint8",
"uint16",
"uint32",
"uint64",
# XXX: float16 is not part of the Array API spec but exposed by
# some namespaces.
"float16",
"float32",
"float64",
"complex64",
"complex128",
}
def __getattr__(self, name):
attr = getattr(numpy, name)
# Support device kwargs and make sure they are on the CPU
if name in self._CREATION_FUNCS:
return _accept_device_cpu(attr)
# Convert to dtype objects
if name in self._DTYPES:
return numpy.dtype(attr)
return attr
@property
def bool(self):
return numpy.bool_
def astype(self, x, dtype, *, copy=True, casting="unsafe"):
# astype is not defined in the top level NumPy namespace
return x.astype(dtype, copy=copy, casting=casting)
def asarray(self, x, *, dtype=None, device=None, copy=None): # noqa
_check_device_cpu(device)
# Support copy in NumPy namespace
if copy is True:
return numpy.array(x, copy=True, dtype=dtype)
else:
return numpy.asarray(x, dtype=dtype)
def unique_inverse(self, x):
return numpy.unique(x, return_inverse=True)
def unique_counts(self, x):
return numpy.unique(x, return_counts=True)
def unique_values(self, x):
return numpy.unique(x)
def concat(self, arrays, *, axis=None):
return numpy.concatenate(arrays, axis=axis)
def reshape(self, x, shape, *, copy=None):
"""Gives a new shape to an array without changing its data.
The Array API specification requires shape to be a tuple.
https://data-apis.org/array-api/latest/API_specification/generated/array_api.reshape.html
"""
if not isinstance(shape, tuple):
raise TypeError(
f"shape must be a tuple, got {shape!r} of type {type(shape)}"
)
if copy is True:
x = x.copy()
return numpy.reshape(x, shape)
def isdtype(self, dtype, kind):
return isdtype(dtype, kind, xp=self)
_NUMPY_API_WRAPPER_INSTANCE = _NumPyAPIWrapper()
def _remove_non_arrays(*arrays, remove_none=True, remove_types=(str,)):
"""Filter arrays to exclude None and/or specific types.
Raise ValueError if no arrays are left after filtering.
Parameters
----------
*arrays : array objects
Array objects.
remove_none : bool, default=True
Whether to ignore None objects passed in arrays.
remove_types : tuple or list, default=(str,)
Types to ignore in the arrays.
Returns
-------
filtered_arrays : list
List of arrays with `None` values and the types in `remove_types` removed.
"""
filtered_arrays = []
remove_types = tuple(remove_types)
for array in arrays:
if remove_none and array is None:
continue
if isinstance(array, remove_types):
continue
filtered_arrays.append(array)
if not filtered_arrays:
raise ValueError(
f"At least one input array expected after filtering with {remove_none=}, "
f"remove_types=[{', '.join(t.__name__ for t in remove_types)}]. Got none. "
f"Original types: [{', '.join(type(a).__name__ for a in arrays)}]."
)
return filtered_arrays
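# Hedged usage sketch (kept as comments): `None` and strings are dropped before
# namespace/device inspection; everything else is kept in order.
#
#   >>> import numpy as np
#   >>> _remove_non_arrays(np.asarray([1]), None, "drop me")
#   [array([1])]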
def get_namespace(*arrays, remove_none=True, remove_types=(str,), xp=None):
"""Get namespace of arrays.
Introspect `arrays` arguments and return their common Array API compatible
namespace object, if any.
See: https://numpy.org/neps/nep-0047-array-api-standard.html
If `arrays` are regular numpy arrays, an instance of the `_NumPyAPIWrapper`
compatibility wrapper is returned instead.
Namespace support is not enabled by default. To enable it, call:
sklearn.set_config(array_api_dispatch=True)
or:
with sklearn.config_context(array_api_dispatch=True):
# your code here
Otherwise an instance of the `_NumPyAPIWrapper` compatibility wrapper is
always returned, irrespective of whether the arrays implement the
`__array_namespace__` protocol or not.
Parameters
----------
*arrays : array objects
Array objects.
remove_none : bool, default=True
Whether to ignore None objects passed in arrays.
remove_types : tuple or list, default=(str,)
Types to ignore in the arrays.
xp : module, default=None
Precomputed array namespace module. When passed, typically from a caller
that has already performed inspection of its own inputs, skips array
namespace inspection.
Returns
-------
namespace : module
Namespace shared by array objects. If any of the `arrays` are not arrays,
the namespace defaults to NumPy.
is_array_api_compliant : bool
True if the arrays are containers that implement the Array API spec.
Always False when array_api_dispatch=False.
"""
array_api_dispatch = get_config()["array_api_dispatch"]
if not array_api_dispatch:
if xp is not None:
return xp, False
else:
return _NUMPY_API_WRAPPER_INSTANCE, False
if xp is not None:
return xp, True
arrays = _remove_non_arrays(
*arrays, remove_none=remove_none, remove_types=remove_types
)
_check_array_api_dispatch(array_api_dispatch)
# array-api-compat is a required dependency of scikit-learn only when
# configuring `array_api_dispatch=True`. Its import should therefore be
# protected by _check_array_api_dispatch to display an informative error
# message in case it is missing.
import array_api_compat
namespace, is_array_api_compliant = array_api_compat.get_namespace(*arrays), True
# These namespaces need additional wrapping to smooth out small differences
# between implementations
if namespace.__name__ in {"cupy.array_api"}:
namespace = _ArrayAPIWrapper(namespace)
return namespace, is_array_api_compliant
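# Hedged usage sketch (kept as comments): with `array_api_dispatch` left at its
# default (False), the NumPy compatibility wrapper is always returned; enabling
# dispatch (with array-api-compat installed) returns the inputs' own namespace.
#
#   >>> import numpy as np
#   >>> xp, is_compliant = get_namespace(np.asarray([1.0]))
#   >>> is_compliant
#   False
#   >>> xp is _NUMPY_API_WRAPPER_INSTANCE
#   True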
def get_namespace_and_device(*array_list, remove_none=True, remove_types=(str,)):
"""Combination into one single function of `get_namespace` and `device`."""
array_list = _remove_non_arrays(
*array_list, remove_none=remove_none, remove_types=remove_types
)
skip_remove_kwargs = dict(remove_none=False, remove_types=[])
return (
*get_namespace(*array_list, **skip_remove_kwargs),
device(*array_list, **skip_remove_kwargs),
)
def _expit(X, xp=None):
xp, _ = get_namespace(X, xp=xp)
if _is_numpy_namespace(xp):
return xp.asarray(special.expit(numpy.asarray(X)))
return 1.0 / (1.0 + xp.exp(-X))
def _add_to_diagonal(array, value, xp):
# Workaround for the lack of support for xp.reshape(a, shape, copy=False) in
# numpy.array_api: https://github.com/numpy/numpy/issues/23410
value = xp.asarray(value, dtype=array.dtype)
if _is_numpy_namespace(xp):
array_np = numpy.asarray(array)
array_np.flat[:: array.shape[0] + 1] += value
return xp.asarray(array_np)
elif value.ndim == 1:
for i in range(array.shape[0]):
array[i, i] += value[i]
else:
# scalar value
for i in range(array.shape[0]):
array[i, i] += value
def _find_matching_floating_dtype(*arrays, xp):
"""Find a suitable floating point dtype when computing with arrays.
If any of the arrays are floating point, return the dtype with the highest
precision by following official type promotion rules:
https://data-apis.org/array-api/latest/API_specification/type_promotion.html
If there are no floating point input arrays (all integral inputs for
instance), return the default floating point dtype for the namespace.
"""
dtyped_arrays = [a for a in arrays if hasattr(a, "dtype")]
floating_dtypes = [
a.dtype for a in dtyped_arrays if xp.isdtype(a.dtype, "real floating")
]
if floating_dtypes:
# Return the floating dtype with the highest precision:
return xp.result_type(*floating_dtypes)
# If none of the input arrays have a floating point dtype, they must be all
# integer arrays or containers of Python scalars: return the default
# floating point dtype for the namespace (implementation specific).
return xp.asarray(0.0).dtype
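# Hedged usage sketch (kept as comments, using the NumPy wrapper defined above as
# `xp`): mixed float32/float64 inputs promote to float64, and all-integer inputs
# fall back to the namespace default floating dtype.
#
#   >>> import numpy as np
#   >>> xp = _NUMPY_API_WRAPPER_INSTANCE
#   >>> _find_matching_floating_dtype(
#   ...     np.asarray([1.0], dtype=np.float32),
#   ...     np.asarray([1.0], dtype=np.float64),
#   ...     xp=xp,
#   ... )
#   dtype('float64')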
def _average(a, axis=None, weights=None, normalize=True, xp=None):
"""Partial port of np.average to support the Array API.
It does a best effort at mimicking the return dtype rule described at
https://numpy.org/doc/stable/reference/generated/numpy.average.html but
only for the common cases needed in scikit-learn.
"""
xp, _, device_ = get_namespace_and_device(a, weights)
if _is_numpy_namespace(xp):
if normalize:
return xp.asarray(numpy.average(a, axis=axis, weights=weights))
elif axis is None and weights is not None:
return xp.asarray(numpy.dot(a, weights))
a = xp.asarray(a, device=device_)
if weights is not None:
weights = xp.asarray(weights, device=device_)
if weights is not None and a.shape != weights.shape:
if axis is None:
raise TypeError(
f"Axis must be specified when the shape of a {tuple(a.shape)} and "
f"weights {tuple(weights.shape)} differ."
)
if weights.ndim != 1:
raise TypeError(
f"1D weights expected when a.shape={tuple(a.shape)} and "
f"weights.shape={tuple(weights.shape)} differ."
)
if size(weights) != a.shape[axis]:
raise ValueError(
f"Length of weights {size(weights)} not compatible with "
f" a.shape={tuple(a.shape)} and {axis=}."
)
# If weights are 1D, add singleton dimensions for broadcasting
shape = [1] * a.ndim
shape[axis] = a.shape[axis]
weights = xp.reshape(weights, shape)
if xp.isdtype(a.dtype, "complex floating"):
raise NotImplementedError(
"Complex floating point values are not supported by average."
)
if weights is not None and xp.isdtype(weights.dtype, "complex floating"):
raise NotImplementedError(
"Complex floating point values are not supported by average."
)
output_dtype = _find_matching_floating_dtype(a, weights, xp=xp)
a = xp.astype(a, output_dtype)
if weights is None:
return (xp.mean if normalize else xp.sum)(a, axis=axis)
weights = xp.astype(weights, output_dtype)
sum_ = xp.sum(xp.multiply(a, weights), axis=axis)
if not normalize:
return sum_
scale = xp.sum(weights, axis=axis)
if xp.any(scale == 0.0):
raise ZeroDivisionError("Weights sum to zero, can't be normalized")
return sum_ / scale
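# Hedged worked example (kept as comments): with a = [1, 2, 3, 4] and
# weights = [1, 1, 1, 5], the normalized result is
# (1*1 + 2*1 + 3*1 + 4*5) / (1 + 1 + 1 + 5) = 26 / 8 = 3.25, while
# normalize=False would return the raw weighted sum 26.0.
#
#   >>> import numpy as np
#   >>> float(_average(np.asarray([1.0, 2.0, 3.0, 4.0]),
#   ...                weights=np.asarray([1.0, 1.0, 1.0, 5.0])))
#   3.25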
def _nanmin(X, axis=None, xp=None):
# TODO: refactor once nan-aware reductions are standardized:
# https://github.com/data-apis/array-api/issues/621
xp, _ = get_namespace(X, xp=xp)
if _is_numpy_namespace(xp):
return xp.asarray(numpy.nanmin(X, axis=axis))
else:
mask = xp.isnan(X)
X = xp.min(xp.where(mask, xp.asarray(+xp.inf, device=device(X)), X), axis=axis)
# Replace Infs from all NaN slices with NaN again
mask = xp.all(mask, axis=axis)
if xp.any(mask):
X = xp.where(mask, xp.asarray(xp.nan), X)
return X
def _nanmax(X, axis=None, xp=None):
# TODO: refactor once nan-aware reductions are standardized:
# https://github.com/data-apis/array-api/issues/621
xp, _ = get_namespace(X, xp=xp)
if _is_numpy_namespace(xp):
return xp.asarray(numpy.nanmax(X, axis=axis))
else:
mask = xp.isnan(X)
X = xp.max(xp.where(mask, xp.asarray(-xp.inf, device=device(X)), X), axis=axis)
# Replace Infs from all NaN slices with NaN again
mask = xp.all(mask, axis=axis)
if xp.any(mask):
X = xp.where(mask, xp.asarray(xp.nan), X)
return X
def _asarray_with_order(
array, dtype=None, order=None, copy=None, *, xp=None, device=None
):
"""Helper to support the order kwarg only for NumPy-backed arrays
Memory layout parameter `order` is not exposed in the Array API standard,
however some input validation code in scikit-learn needs to work both
for classes and functions that will leverage Array API only operations
and for code that inherently relies on NumPy backed data containers with
specific memory layout constraints (e.g. our own Cython code). The
purpose of this helper is to make it possible to share code for data
container validation without memory copies for both downstream use cases:
the `order` parameter is only enforced if the input array implementation
is NumPy based, otherwise `order` is just silently ignored.
"""
xp, _ = get_namespace(array, xp=xp)
if _is_numpy_namespace(xp):
# Use NumPy API to support order
if copy is True:
array = numpy.array(array, order=order, dtype=dtype)
else:
array = numpy.asarray(array, order=order, dtype=dtype)
# At this point array is a NumPy ndarray. We convert it to an array
# container that is consistent with the input's namespace.
return xp.asarray(array)
else:
return xp.asarray(array, dtype=dtype, copy=copy, device=device)
def _ravel(array, xp=None):
"""Array API compliant version of np.ravel.
For non-NumPy namespaces, it just returns a flattened array that may or
may not be a copy.
"""
xp, _ = get_namespace(array, xp=xp)
if _is_numpy_namespace(xp):
array = numpy.asarray(array)
return xp.asarray(numpy.ravel(array, order="C"))
return xp.reshape(array, shape=(-1,))
def _convert_to_numpy(array, xp):
"""Convert X into a NumPy ndarray on the CPU."""
xp_name = xp.__name__
if xp_name in {"array_api_compat.torch", "torch"}:
return array.cpu().numpy()
elif xp_name == "cupy.array_api":
return array._array.get()
elif xp_name in {"array_api_compat.cupy", "cupy"}: # pragma: nocover
return array.get()
return numpy.asarray(array)
def _estimator_with_converted_arrays(estimator, converter):
"""Create new estimator which converting all attributes that are arrays.
The converter is called on all NumPy arrays and arrays that support the
`DLPack interface <https://dmlc.github.io/dlpack/latest/>`__.
Parameters
----------
estimator : Estimator
Estimator to convert
converter : callable
Callable that takes an array attribute and returns the converted array.
Returns
-------
new_estimator : Estimator
Converted estimator.
"""
from sklearn.base import clone
new_estimator = clone(estimator)
for key, attribute in vars(estimator).items():
if hasattr(attribute, "__dlpack__") or isinstance(attribute, numpy.ndarray):
attribute = converter(attribute)
setattr(new_estimator, key, attribute)
return new_estimator
def _atol_for_type(dtype):
"""Return the absolute tolerance for a given numpy dtype."""
return numpy.finfo(dtype).eps * 100
def indexing_dtype(xp):
"""Return a platform-specific integer dtype suitable for indexing.
On 32-bit platforms, this will typically return int32 and int64 otherwise.
Note: using dtype is recommended for indexing transient array
datastructures. For long-lived arrays, such as the fitted attributes of
estimators, it is instead recommended to use platform-independent int32 if
we do not expect to index more than 2B elements. Using fixed dtypes simplifies
the handling of serialized models, e.g. to deploy a model fit on a 64-bit
platform to a target 32-bit platform such as WASM/pyodide.
"""
# Currently this is implemented with a simple hack that assumes that
# following "may be" statements in the Array API spec always hold:
# > The default integer data type should be the same across platforms, but
# > the default may vary depending on whether Python is 32-bit or 64-bit.
# > The default array index data type may be int32 on 32-bit platforms, but
# > the default should be int64 otherwise.
# https://data-apis.org/array-api/latest/API_specification/data_types.html#default-data-types
# TODO: once sufficiently adopted, we might want to instead rely on the
# newer inspection API: https://github.com/data-apis/array-api/issues/640
return xp.asarray(0).dtype
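# Hedged usage sketch (kept as comments): with the NumPy wrapper defined above
# this resolves to NumPy's default integer dtype, typically int64 on 64-bit
# platforms and int32 on 32-bit ones.
#
#   >>> indexing_dtype(_NUMPY_API_WRAPPER_INSTANCE).kind
#   'i'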
@@ -0,0 +1,93 @@
from functools import update_wrapper, wraps
from types import MethodType
class _AvailableIfDescriptor:
"""Implements a conditional property using the descriptor protocol.
Using this class to create a decorator will raise an ``AttributeError``
if check(self) returns a falsey value. Note that if check raises an error
this will also result in hasattr returning false.
See https://docs.python.org/3/howto/descriptor.html for an explanation of
descriptors.
"""
def __init__(self, fn, check, attribute_name):
self.fn = fn
self.check = check
self.attribute_name = attribute_name
# update the docstring of the descriptor
update_wrapper(self, fn)
def _check(self, obj, owner):
attr_err_msg = (
f"This {repr(owner.__name__)} has no attribute {repr(self.attribute_name)}"
)
try:
check_result = self.check(obj)
except Exception as e:
raise AttributeError(attr_err_msg) from e
if not check_result:
raise AttributeError(attr_err_msg)
def __get__(self, obj, owner=None):
if obj is not None:
# delegate only on instances, not the classes.
# this is to allow access to the docstrings.
self._check(obj, owner=owner)
out = MethodType(self.fn, obj)
else:
# This makes it possible to use the decorated method as an unbound method,
# for instance when monkeypatching.
@wraps(self.fn)
def out(*args, **kwargs):
self._check(args[0], owner=owner)
return self.fn(*args, **kwargs)
return out
def available_if(check):
"""An attribute that is available only if check returns a truthy value.
Parameters
----------
check : callable
When passed the object with the decorated method, this should return
a truthy value if the attribute is available, and either return False
or raise an AttributeError if not available.
Returns
-------
callable
Callable makes the decorated method available if `check` returns
a truthy value, otherwise the decorated method is unavailable.
Examples
--------
>>> from sklearn.utils.metaestimators import available_if
>>> class HelloIfEven:
... def __init__(self, x):
... self.x = x
...
... def _x_is_even(self):
... return self.x % 2 == 0
...
... @available_if(_x_is_even)
... def say_hello(self):
... print("Hello")
...
>>> obj = HelloIfEven(1)
>>> hasattr(obj, "say_hello")
False
>>> obj.x = 2
>>> hasattr(obj, "say_hello")
True
>>> obj.say_hello()
Hello
"""
return lambda fn: _AvailableIfDescriptor(fn, check, attribute_name=fn.__name__)
@@ -0,0 +1,67 @@
import warnings
class Bunch(dict):
"""Container object exposing keys as attributes.
Bunch objects are sometimes used as an output for functions and methods.
They extend dictionaries by enabling values to be accessed by key,
`bunch["value_key"]`, or by an attribute, `bunch.value_key`.
Examples
--------
>>> from sklearn.utils import Bunch
>>> b = Bunch(a=1, b=2)
>>> b['b']
2
>>> b.b
2
>>> b.a = 3
>>> b['a']
3
>>> b.c = 6
>>> b['c']
6
"""
def __init__(self, **kwargs):
super().__init__(kwargs)
# Map from deprecated key to warning message
self.__dict__["_deprecated_key_to_warnings"] = {}
def __getitem__(self, key):
if key in self.__dict__.get("_deprecated_key_to_warnings", {}):
warnings.warn(
self._deprecated_key_to_warnings[key],
FutureWarning,
)
return super().__getitem__(key)
def _set_deprecated(self, value, *, new_key, deprecated_key, warning_message):
"""Set key in dictionary to be deprecated with its warning message."""
self.__dict__["_deprecated_key_to_warnings"][deprecated_key] = warning_message
self[new_key] = self[deprecated_key] = value
def __setattr__(self, key, value):
self[key] = value
def __dir__(self):
return self.keys()
def __getattr__(self, key):
try:
return self[key]
except KeyError:
raise AttributeError(key)
def __setstate__(self, state):
# Bunch pickles generated with scikit-learn 0.16.* have a non-empty
# __dict__. This causes a surprising behaviour when loading these
# pickles with scikit-learn 0.17: reading bunch.key uses __dict__ but
# assigning to bunch.key uses __setattr__ and only changes
# bunch['key']. More details can be found at:
# https://github.com/scikit-learn/scikit-learn/issues/6196.
# Overriding __setstate__ to be a noop has the effect of
# ignoring the pickled __dict__
pass
@@ -0,0 +1,175 @@
import warnings
from itertools import islice
from numbers import Integral
import numpy as np
from .._config import get_config
from ._param_validation import Interval, validate_params
def chunk_generator(gen, chunksize):
"""Chunk generator, ``gen`` into lists of length ``chunksize``. The last
chunk may have a length less than ``chunksize``."""
while True:
chunk = list(islice(gen, chunksize))
if chunk:
yield chunk
else:
return
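# Hedged usage sketch (kept as comments): only the final chunk may be shorter
# than ``chunksize``.
#
#   >>> list(chunk_generator(iter(range(7)), 3))
#   [[0, 1, 2], [3, 4, 5], [6]]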
@validate_params(
{
"n": [Interval(Integral, 1, None, closed="left")],
"batch_size": [Interval(Integral, 1, None, closed="left")],
"min_batch_size": [Interval(Integral, 0, None, closed="left")],
},
prefer_skip_nested_validation=True,
)
def gen_batches(n, batch_size, *, min_batch_size=0):
"""Generator to create slices containing `batch_size` elements from 0 to `n`.
The last slice may contain less than `batch_size` elements, when
`batch_size` does not divide `n`.
Parameters
----------
n : int
Size of the sequence.
batch_size : int
Number of elements in each batch.
min_batch_size : int, default=0
Minimum number of elements in each batch.
Yields
------
slice of `batch_size` elements
See Also
--------
gen_even_slices: Generator to create n_packs slices going up to n.
Examples
--------
>>> from sklearn.utils import gen_batches
>>> list(gen_batches(7, 3))
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
>>> list(gen_batches(6, 3))
[slice(0, 3, None), slice(3, 6, None)]
>>> list(gen_batches(2, 3))
[slice(0, 2, None)]
>>> list(gen_batches(7, 3, min_batch_size=0))
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
>>> list(gen_batches(7, 3, min_batch_size=2))
[slice(0, 3, None), slice(3, 7, None)]
"""
start = 0
for _ in range(int(n // batch_size)):
end = start + batch_size
if end + min_batch_size > n:
continue
yield slice(start, end)
start = end
if start < n:
yield slice(start, n)
@validate_params(
{
"n": [Interval(Integral, 1, None, closed="left")],
"n_packs": [Interval(Integral, 1, None, closed="left")],
"n_samples": [Interval(Integral, 1, None, closed="left"), None],
},
prefer_skip_nested_validation=True,
)
def gen_even_slices(n, n_packs, *, n_samples=None):
"""Generator to create `n_packs` evenly spaced slices going up to `n`.
If `n_packs` does not divide `n`, except for the first `n % n_packs`
slices, remaining slices may contain fewer elements.
Parameters
----------
n : int
Size of the sequence.
n_packs : int
Number of slices to generate.
n_samples : int, default=None
Number of samples. Pass `n_samples` when the slices are to be used for
sparse matrix indexing; slicing off-the-end raises an exception, while
it works for NumPy arrays.
Yields
------
`slice` representing a set of indices from 0 to n.
See Also
--------
gen_batches: Generator to create slices containing batch_size elements
from 0 to n.
Examples
--------
>>> from sklearn.utils import gen_even_slices
>>> list(gen_even_slices(10, 1))
[slice(0, 10, None)]
>>> list(gen_even_slices(10, 10))
[slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
>>> list(gen_even_slices(10, 5))
[slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
>>> list(gen_even_slices(10, 3))
[slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
"""
start = 0
for pack_num in range(n_packs):
this_n = n // n_packs
if pack_num < n % n_packs:
this_n += 1
if this_n > 0:
end = start + this_n
if n_samples is not None:
end = min(n_samples, end)
yield slice(start, end, None)
start = end
def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
"""Calculate how many rows can be processed within `working_memory`.
Parameters
----------
row_bytes : int
The expected number of bytes of memory that will be consumed
during the processing of each row.
max_n_rows : int, default=None
The maximum return value.
working_memory : int or float, default=None
The amount of memory, in MiB, that the rows returned may occupy in
total. When None (default), the value of
``sklearn.get_config()['working_memory']`` is used.
Returns
-------
int
The number of rows which can be processed within `working_memory`.
Warns
-----
Issues a UserWarning if `row_bytes` exceeds `working_memory` MiB.
"""
if working_memory is None:
working_memory = get_config()["working_memory"]
chunk_n_rows = int(working_memory * (2**20) // row_bytes)
if max_n_rows is not None:
chunk_n_rows = min(chunk_n_rows, max_n_rows)
if chunk_n_rows < 1:
warnings.warn(
"Could not adhere to working_memory config. "
"Currently %.0fMiB, %.0fMiB required."
% (working_memory, np.ceil(row_bytes * 2**-20))
)
chunk_n_rows = 1
return chunk_n_rows
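# Hedged worked example (kept as comments): for rows of 10,000 float64 features,
# row_bytes = 8 * 10,000 = 80,000 bytes, and with working_memory=1024 (MiB) the
# chunk size is 1024 * 2**20 // 80,000 = 13,421 rows.
#
#   >>> get_chunk_n_rows(row_bytes=80_000, working_memory=1024)
#   13421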
@@ -0,0 +1,41 @@
from cython cimport floating
cpdef enum BLAS_Order:
RowMajor # C contiguous
ColMajor # Fortran contiguous
cpdef enum BLAS_Trans:
NoTrans = 110 # corresponds to 'n'
Trans = 116 # corresponds to 't'
# BLAS Level 1 ################################################################
cdef floating _dot(int, const floating*, int, const floating*, int) noexcept nogil
cdef floating _asum(int, const floating*, int) noexcept nogil
cdef void _axpy(int, floating, const floating*, int, floating*, int) noexcept nogil
cdef floating _nrm2(int, const floating*, int) noexcept nogil
cdef void _copy(int, const floating*, int, const floating*, int) noexcept nogil
cdef void _scal(int, floating, const floating*, int) noexcept nogil
cdef void _rotg(floating*, floating*, floating*, floating*) noexcept nogil
cdef void _rot(int, floating*, int, floating*, int, floating, floating) noexcept nogil
# BLAS Level 2 ################################################################
cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, const floating*, int,
const floating*, int, floating, floating*, int) noexcept nogil
cdef void _ger(BLAS_Order, int, int, floating, const floating*, int, const floating*,
int, floating*, int) noexcept nogil
# BLAS Level 3 ###############################################################
cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating,
const floating*, int, const floating*, int, floating, floating*,
int) noexcept nogil
@@ -0,0 +1,233 @@
from cython cimport floating
from scipy.linalg.cython_blas cimport sdot, ddot
from scipy.linalg.cython_blas cimport sasum, dasum
from scipy.linalg.cython_blas cimport saxpy, daxpy
from scipy.linalg.cython_blas cimport snrm2, dnrm2
from scipy.linalg.cython_blas cimport scopy, dcopy
from scipy.linalg.cython_blas cimport sscal, dscal
from scipy.linalg.cython_blas cimport srotg, drotg
from scipy.linalg.cython_blas cimport srot, drot
from scipy.linalg.cython_blas cimport sgemv, dgemv
from scipy.linalg.cython_blas cimport sger, dger
from scipy.linalg.cython_blas cimport sgemm, dgemm
################
# BLAS Level 1 #
################
cdef floating _dot(int n, const floating *x, int incx,
const floating *y, int incy) noexcept nogil:
"""x.T.y"""
if floating is float:
return sdot(&n, <float *> x, &incx, <float *> y, &incy)
else:
return ddot(&n, <double *> x, &incx, <double *> y, &incy)
cpdef _dot_memview(const floating[::1] x, const floating[::1] y):
return _dot(x.shape[0], &x[0], 1, &y[0], 1)
cdef floating _asum(int n, const floating *x, int incx) noexcept nogil:
"""sum(|x_i|)"""
if floating is float:
return sasum(&n, <float *> x, &incx)
else:
return dasum(&n, <double *> x, &incx)
cpdef _asum_memview(const floating[::1] x):
return _asum(x.shape[0], &x[0], 1)
cdef void _axpy(int n, floating alpha, const floating *x, int incx,
floating *y, int incy) noexcept nogil:
"""y := alpha * x + y"""
if floating is float:
saxpy(&n, &alpha, <float *> x, &incx, y, &incy)
else:
daxpy(&n, &alpha, <double *> x, &incx, y, &incy)
cpdef _axpy_memview(floating alpha, const floating[::1] x, floating[::1] y):
_axpy(x.shape[0], alpha, &x[0], 1, &y[0], 1)
cdef floating _nrm2(int n, const floating *x, int incx) noexcept nogil:
"""sqrt(sum((x_i)^2))"""
if floating is float:
return snrm2(&n, <float *> x, &incx)
else:
return dnrm2(&n, <double *> x, &incx)
cpdef _nrm2_memview(const floating[::1] x):
return _nrm2(x.shape[0], &x[0], 1)
cdef void _copy(int n, const floating *x, int incx, const floating *y, int incy) noexcept nogil:
"""y := x"""
if floating is float:
scopy(&n, <float *> x, &incx, <float *> y, &incy)
else:
dcopy(&n, <double *> x, &incx, <double *> y, &incy)
cpdef _copy_memview(const floating[::1] x, const floating[::1] y):
_copy(x.shape[0], &x[0], 1, &y[0], 1)
cdef void _scal(int n, floating alpha, const floating *x, int incx) noexcept nogil:
"""x := alpha * x"""
if floating is float:
sscal(&n, &alpha, <float *> x, &incx)
else:
dscal(&n, &alpha, <double *> x, &incx)
cpdef _scal_memview(floating alpha, const floating[::1] x):
_scal(x.shape[0], alpha, &x[0], 1)
cdef void _rotg(floating *a, floating *b, floating *c, floating *s) noexcept nogil:
"""Generate plane rotation"""
if floating is float:
srotg(a, b, c, s)
else:
drotg(a, b, c, s)
cpdef _rotg_memview(floating a, floating b, floating c, floating s):
_rotg(&a, &b, &c, &s)
return a, b, c, s
cdef void _rot(int n, floating *x, int incx, floating *y, int incy,
floating c, floating s) noexcept nogil:
"""Apply plane rotation"""
if floating is float:
srot(&n, x, &incx, y, &incy, &c, &s)
else:
drot(&n, x, &incx, y, &incy, &c, &s)
cpdef _rot_memview(floating[::1] x, floating[::1] y, floating c, floating s):
_rot(x.shape[0], &x[0], 1, &y[0], 1, c, s)
################
# BLAS Level 2 #
################
cdef void _gemv(BLAS_Order order, BLAS_Trans ta, int m, int n, floating alpha,
const floating *A, int lda, const floating *x, int incx,
floating beta, floating *y, int incy) noexcept nogil:
"""y := alpha * op(A).x + beta * y"""
cdef char ta_ = ta
if order == RowMajor:
ta_ = NoTrans if ta == Trans else Trans
if floating is float:
sgemv(&ta_, &n, &m, &alpha, <float *> A, &lda, <float *> x,
&incx, &beta, y, &incy)
else:
dgemv(&ta_, &n, &m, &alpha, <double *> A, &lda, <double *> x,
&incx, &beta, y, &incy)
else:
if floating is float:
sgemv(&ta_, &m, &n, &alpha, <float *> A, &lda, <float *> x,
&incx, &beta, y, &incy)
else:
dgemv(&ta_, &m, &n, &alpha, <double *> A, &lda, <double *> x,
&incx, &beta, y, &incy)
cpdef _gemv_memview(BLAS_Trans ta, floating alpha, const floating[:, :] A,
const floating[::1] x, floating beta, floating[::1] y):
cdef:
int m = A.shape[0]
int n = A.shape[1]
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
int lda = m if order == ColMajor else n
_gemv(order, ta, m, n, alpha, &A[0, 0], lda, &x[0], 1, beta, &y[0], 1)
cdef void _ger(BLAS_Order order, int m, int n, floating alpha,
const floating *x, int incx, const floating *y,
int incy, floating *A, int lda) noexcept nogil:
"""A := alpha * x.y.T + A"""
if order == RowMajor:
if floating is float:
sger(&n, &m, &alpha, <float *> y, &incy, <float *> x, &incx, A, &lda)
else:
dger(&n, &m, &alpha, <double *> y, &incy, <double *> x, &incx, A, &lda)
else:
if floating is float:
sger(&m, &n, &alpha, <float *> x, &incx, <float *> y, &incy, A, &lda)
else:
dger(&m, &n, &alpha, <double *> x, &incx, <double *> y, &incy, A, &lda)
cpdef _ger_memview(floating alpha, const floating[::1] x,
const floating[::1] y, floating[:, :] A):
cdef:
int m = A.shape[0]
int n = A.shape[1]
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
int lda = m if order == ColMajor else n
_ger(order, m, n, alpha, &x[0], 1, &y[0], 1, &A[0, 0], lda)
################
# BLAS Level 3 #
################
cdef void _gemm(BLAS_Order order, BLAS_Trans ta, BLAS_Trans tb, int m, int n,
int k, floating alpha, const floating *A, int lda, const floating *B,
int ldb, floating beta, floating *C, int ldc) noexcept nogil:
"""C := alpha * op(A).op(B) + beta * C"""
# TODO: Remove the pointer casts below once SciPy uses const-qualification.
# See: https://github.com/scipy/scipy/issues/14262
cdef:
char ta_ = ta
char tb_ = tb
if order == RowMajor:
if floating is float:
sgemm(&tb_, &ta_, &n, &m, &k, &alpha, <float*>B,
&ldb, <float*>A, &lda, &beta, C, &ldc)
else:
dgemm(&tb_, &ta_, &n, &m, &k, &alpha, <double*>B,
&ldb, <double*>A, &lda, &beta, C, &ldc)
else:
if floating is float:
sgemm(&ta_, &tb_, &m, &n, &k, &alpha, <float*>A,
&lda, <float*>B, &ldb, &beta, C, &ldc)
else:
dgemm(&ta_, &tb_, &m, &n, &k, &alpha, <double*>A,
&lda, <double*>B, &ldb, &beta, C, &ldc)
cpdef _gemm_memview(BLAS_Trans ta, BLAS_Trans tb, floating alpha,
const floating[:, :] A, const floating[:, :] B, floating beta,
floating[:, :] C):
cdef:
int m = A.shape[0] if ta == NoTrans else A.shape[1]
int n = B.shape[1] if tb == NoTrans else B.shape[0]
int k = A.shape[1] if ta == NoTrans else A.shape[0]
int lda, ldb, ldc
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
if order == RowMajor:
lda = k if ta == NoTrans else m
ldb = n if tb == NoTrans else k
ldc = n
else:
lda = m if ta == NoTrans else k
ldb = k if tb == NoTrans else n
ldc = m
_gemm(order, ta, tb, m, n, k, alpha, &A[0, 0],
lda, &B[0, 0], ldb, beta, &C[0, 0], ldc)
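# Hedged usage sketch (kept as comments): the `*_memview` helpers are thin
# Python-callable wrappers around the typed BLAS routines, mainly useful for
# tests once this module is compiled. With contiguous float64 arrays:
#
#   >>> import numpy as np
#   >>> _dot_memview(np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0]))
#   32.0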
@@ -0,0 +1,367 @@
from collections import Counter
from contextlib import suppress
from typing import NamedTuple
import numpy as np
from ._missing import is_scalar_nan
def _unique(values, *, return_inverse=False, return_counts=False):
"""Helper function to find unique values with support for python objects.
Uses pure python method for object dtype, and numpy method for
all other dtypes.
Parameters
----------
values : ndarray
Values to check for unknowns.
return_inverse : bool, default=False
If True, also return the indices of the unique values.
return_counts : bool, default=False
If True, also return the number of times each unique item appears in
values.
Returns
-------
unique : ndarray
The sorted unique values.
unique_inverse : ndarray
The indices to reconstruct the original array from the unique array.
Only provided if `return_inverse` is True.
unique_counts : ndarray
The number of times each of the unique values comes up in the original
array. Only provided if `return_counts` is True.
"""
if values.dtype == object:
return _unique_python(
values, return_inverse=return_inverse, return_counts=return_counts
)
# numerical
return _unique_np(
values, return_inverse=return_inverse, return_counts=return_counts
)
def _unique_np(values, return_inverse=False, return_counts=False):
"""Helper function to find unique values for numpy arrays that correctly
accounts for nans. See `_unique` documentation for details."""
uniques = np.unique(
values, return_inverse=return_inverse, return_counts=return_counts
)
inverse, counts = None, None
if return_counts:
*uniques, counts = uniques
if return_inverse:
*uniques, inverse = uniques
if return_counts or return_inverse:
uniques = uniques[0]
# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove them from uniques
if uniques.size and is_scalar_nan(uniques[-1]):
nan_idx = np.searchsorted(uniques, np.nan)
uniques = uniques[: nan_idx + 1]
if return_inverse:
inverse[inverse > nan_idx] = nan_idx
if return_counts:
counts[nan_idx] = np.sum(counts[nan_idx:])
counts = counts[: nan_idx + 1]
ret = (uniques,)
if return_inverse:
ret += (inverse,)
if return_counts:
ret += (counts,)
return ret[0] if len(ret) == 1 else ret
class MissingValues(NamedTuple):
"""Data class for missing data information"""
nan: bool
none: bool
def to_list(self):
"""Convert tuple to a list where None is always first."""
output = []
if self.none:
output.append(None)
if self.nan:
output.append(np.nan)
return output
def _extract_missing(values):
"""Extract missing values from `values`.
Parameters
----------
values: set
Set of values to extract missing from.
Returns
-------
output: set
Set with missing values extracted.
missing_values: MissingValues
Object with missing value information.
"""
missing_values_set = {
value for value in values if value is None or is_scalar_nan(value)
}
if not missing_values_set:
return values, MissingValues(nan=False, none=False)
if None in missing_values_set:
if len(missing_values_set) == 1:
output_missing_values = MissingValues(nan=False, none=True)
else:
# If there is more than one missing value, then it has to be
# float('nan') or np.nan
output_missing_values = MissingValues(nan=True, none=True)
else:
output_missing_values = MissingValues(nan=True, none=False)
# create set without the missing values
output = values - missing_values_set
return output, output_missing_values
class _nandict(dict):
"""Dictionary with support for nans."""
def __init__(self, mapping):
super().__init__(mapping)
for key, value in mapping.items():
if is_scalar_nan(key):
self.nan_value = value
break
def __missing__(self, key):
if hasattr(self, "nan_value") and is_scalar_nan(key):
return self.nan_value
raise KeyError(key)
def _map_to_integer(values, uniques):
"""Map values based on its position in uniques."""
table = _nandict({val: i for i, val in enumerate(uniques)})
return np.array([table[v] for v in values])
def _unique_python(values, *, return_inverse, return_counts):
# Only used in `_unique`, see docstring there for details
try:
uniques_set = set(values)
uniques_set, missing_values = _extract_missing(uniques_set)
uniques = sorted(uniques_set)
uniques.extend(missing_values.to_list())
uniques = np.array(uniques, dtype=values.dtype)
except TypeError:
types = sorted(t.__qualname__ for t in set(type(v) for v in values))
raise TypeError(
"Encoders require their input argument must be uniformly "
f"strings or numbers. Got {types}"
)
ret = (uniques,)
if return_inverse:
ret += (_map_to_integer(values, uniques),)
if return_counts:
ret += (_get_counts(values, uniques),)
return ret[0] if len(ret) == 1 else ret
def _encode(values, *, uniques, check_unknown=True):
"""Helper function to encode values into [0, n_uniques - 1].
Uses pure python method for object dtype, and numpy method for
all other dtypes.
The numpy method has the limitation that the `uniques` need to
be sorted. Importantly, this is not checked but assumed to already be
the case. The calling method needs to ensure this for all non-object
values.
Parameters
----------
values : ndarray
Values to encode.
uniques : ndarray
The unique values in `values`. If the dtype is not object, then
`uniques` needs to be sorted.
check_unknown : bool, default=True
If True, check for values in `values` that are not in `uniques`
and raise an error. This is ignored for object dtype, and treated as
True in this case. This parameter is useful for
_BaseEncoder._transform() to avoid calling _check_unknown()
twice.
Returns
-------
encoded : ndarray
Encoded values
"""
if values.dtype.kind in "OUS":
try:
return _map_to_integer(values, uniques)
except KeyError as e:
raise ValueError(f"y contains previously unseen labels: {str(e)}")
else:
if check_unknown:
diff = _check_unknown(values, uniques)
if diff:
raise ValueError(f"y contains previously unseen labels: {str(diff)}")
return np.searchsorted(uniques, values)
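# Hedged usage sketch (kept as comments): `uniques` typically comes from
# `_unique` above, and `_encode` maps every value to its position in that
# sorted vocabulary.
#
#   >>> import numpy as np
#   >>> values = np.array(["b", "a", "b"], dtype=object)
#   >>> uniques = _unique(values)
#   >>> uniques
#   array(['a', 'b'], dtype=object)
#   >>> _encode(values, uniques=uniques)
#   array([1, 0, 1])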
def _check_unknown(values, known_values, return_mask=False):
"""
Helper function to check for unknowns in values to be encoded.
Uses pure python method for object dtype, and numpy method for
all other dtypes.
Parameters
----------
values : array
Values to check for unknowns.
known_values : array
Known values. Must be unique.
return_mask : bool, default=False
If True, return a mask of the same shape as `values` indicating
the valid values.
Returns
-------
diff : list
The unique values present in `values` and not in `known_values`.
valid_mask : boolean array
Additionally returned if ``return_mask=True``.
"""
valid_mask = None
if values.dtype.kind in "OUS":
values_set = set(values)
values_set, missing_in_values = _extract_missing(values_set)
uniques_set = set(known_values)
uniques_set, missing_in_uniques = _extract_missing(uniques_set)
diff = values_set - uniques_set
nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan
none_in_diff = missing_in_values.none and not missing_in_uniques.none
def is_valid(value):
return (
value in uniques_set
or missing_in_uniques.none
and value is None
or missing_in_uniques.nan
and is_scalar_nan(value)
)
if return_mask:
if diff or nan_in_diff or none_in_diff:
valid_mask = np.array([is_valid(value) for value in values])
else:
valid_mask = np.ones(len(values), dtype=bool)
diff = list(diff)
if none_in_diff:
diff.append(None)
if nan_in_diff:
diff.append(np.nan)
else:
unique_values = np.unique(values)
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
if return_mask:
if diff.size:
valid_mask = np.isin(values, known_values)
else:
valid_mask = np.ones(len(values), dtype=bool)
# check for nans in the known_values
if np.isnan(known_values).any():
diff_is_nan = np.isnan(diff)
if diff_is_nan.any():
# removes nan from valid_mask
if diff.size and return_mask:
is_nan = np.isnan(values)
valid_mask[is_nan] = 1
# remove nan from diff
diff = diff[~diff_is_nan]
diff = list(diff)
if return_mask:
return diff, valid_mask
return diff
class _NaNCounter(Counter):
"""Counter with support for nan values."""
def __init__(self, items):
super().__init__(self._generate_items(items))
def _generate_items(self, items):
"""Generate items without nans. Stores the nan counts separately."""
for item in items:
if not is_scalar_nan(item):
yield item
continue
if not hasattr(self, "nan_count"):
self.nan_count = 0
self.nan_count += 1
def __missing__(self, key):
if hasattr(self, "nan_count") and is_scalar_nan(key):
return self.nan_count
raise KeyError(key)
def _get_counts(values, uniques):
"""Get the count of each of the `uniques` in `values`.
The counts will use the order passed in by `uniques`. For non-object dtypes,
`uniques` is assumed to be sorted and `np.nan` is at the end.
"""
if values.dtype.kind in "OU":
counter = _NaNCounter(values)
output = np.zeros(len(uniques), dtype=np.int64)
for i, item in enumerate(uniques):
with suppress(KeyError):
output[i] = counter[item]
return output
unique_values, counts = _unique_np(values, return_counts=True)
# Reorder unique_values based on input: `uniques`
uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]):
uniques_in_values[-1] = True
unique_valid_indices = np.searchsorted(unique_values, uniques[uniques_in_values])
output = np.zeros_like(uniques, dtype=np.int64)
output[uniques_in_values] = counts[unique_valid_indices]
return output
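# Hedged usage sketch (kept as comments): counts follow the order of `uniques`,
# including categories that never occur in `values`.
#
#   >>> import numpy as np
#   >>> values = np.array(["a", "b", "a"], dtype=object)
#   >>> _get_counts(values, np.array(["a", "b", "c"], dtype=object))
#   array([2, 1, 0])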
@@ -0,0 +1,404 @@
#$id {
/* Definition of color scheme common for light and dark mode */
--sklearn-color-text: black;
--sklearn-color-line: gray;
/* Definition of color scheme for unfitted estimators */
--sklearn-color-unfitted-level-0: #fff5e6;
--sklearn-color-unfitted-level-1: #f6e4d2;
--sklearn-color-unfitted-level-2: #ffe0b3;
--sklearn-color-unfitted-level-3: chocolate;
/* Definition of color scheme for fitted estimators */
--sklearn-color-fitted-level-0: #f0f8ff;
--sklearn-color-fitted-level-1: #d4ebff;
--sklearn-color-fitted-level-2: #b3dbfd;
--sklearn-color-fitted-level-3: cornflowerblue;
/* Specific color for light theme */
--sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));
--sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));
--sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));
--sklearn-color-icon: #696969;
@media (prefers-color-scheme: dark) {
/* Redefinition of color scheme for dark theme */
--sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));
--sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));
--sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));
--sklearn-color-icon: #878787;
}
}
#$id {
color: var(--sklearn-color-text);
}
#$id pre {
padding: 0;
}
#$id input.sk-hidden--visually {
border: 0;
clip: rect(1px 1px 1px 1px);
clip: rect(1px, 1px, 1px, 1px);
height: 1px;
margin: -1px;
overflow: hidden;
padding: 0;
position: absolute;
width: 1px;
}
#$id div.sk-dashed-wrapped {
border: 1px dashed var(--sklearn-color-line);
margin: 0 0.4em 0.5em 0.4em;
box-sizing: border-box;
padding-bottom: 0.4em;
background-color: var(--sklearn-color-background);
}
#$id div.sk-container {
/* jupyter's `normalize.less` sets `[hidden] { display: none; }`
but bootstrap.min.css set `[hidden] { display: none !important; }`
so we also need the `!important` here to be able to override the
default hidden behavior on the sphinx rendered scikit-learn.org.
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */
display: inline-block !important;
position: relative;
}
#$id div.sk-text-repr-fallback {
display: none;
}
div.sk-parallel-item,
div.sk-serial,
div.sk-item {
/* draw centered vertical line to link estimators */
background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));
background-size: 2px 100%;
background-repeat: no-repeat;
background-position: center center;
}
/* Parallel-specific style estimator block */
#$id div.sk-parallel-item::after {
content: "";
width: 100%;
border-bottom: 2px solid var(--sklearn-color-text-on-default-background);
flex-grow: 1;
}
#$id div.sk-parallel {
display: flex;
align-items: stretch;
justify-content: center;
background-color: var(--sklearn-color-background);
position: relative;
}
#$id div.sk-parallel-item {
display: flex;
flex-direction: column;
}
#$id div.sk-parallel-item:first-child::after {
align-self: flex-end;
width: 50%;
}
#$id div.sk-parallel-item:last-child::after {
align-self: flex-start;
width: 50%;
}
#$id div.sk-parallel-item:only-child::after {
width: 0;
}
/* Serial-specific style estimator block */
#$id div.sk-serial {
display: flex;
flex-direction: column;
align-items: center;
background-color: var(--sklearn-color-background);
padding-right: 1em;
padding-left: 1em;
}
/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is
clickable and can be expanded/collapsed.
- Pipeline and ColumnTransformer use this feature and define the default style
- Estimators will overwrite some part of the style using the `sk-estimator` class
*/
/* Pipeline and ColumnTransformer style (default) */
#$id div.sk-toggleable {
/* Default theme specific background. It is overwritten whether we have a
specific estimator or a Pipeline/ColumnTransformer */
background-color: var(--sklearn-color-background);
}
/* Toggleable label */
#$id label.sk-toggleable__label {
cursor: pointer;
display: block;
width: 100%;
margin-bottom: 0;
padding: 0.5em;
box-sizing: border-box;
text-align: center;
}
#$id label.sk-toggleable__label-arrow:before {
/* Arrow on the left of the label */
content: "▸";
float: left;
margin-right: 0.25em;
color: var(--sklearn-color-icon);
}
#$id label.sk-toggleable__label-arrow:hover:before {
color: var(--sklearn-color-text);
}
/* Toggleable content - dropdown */
#$id div.sk-toggleable__content {
max-height: 0;
max-width: 0;
overflow: hidden;
text-align: left;
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-0);
}
#$id div.sk-toggleable__content.fitted {
/* fitted */
background-color: var(--sklearn-color-fitted-level-0);
}
#$id div.sk-toggleable__content pre {
margin: 0.2em;
border-radius: 0.25em;
color: var(--sklearn-color-text);
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-0);
}
#$id div.sk-toggleable__content.fitted pre {
/* unfitted */
background-color: var(--sklearn-color-fitted-level-0);
}
#$id input.sk-toggleable__control:checked~div.sk-toggleable__content {
/* Expand drop-down */
max-height: 200px;
max-width: 100%;
overflow: auto;
}
#$id input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {
content: "▾";
}
/* Pipeline/ColumnTransformer-specific style */
#$id div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {
color: var(--sklearn-color-text);
background-color: var(--sklearn-color-unfitted-level-2);
}
#$id div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {
background-color: var(--sklearn-color-fitted-level-2);
}
/* Estimator-specific style */
/* Colorize estimator box */
#$id div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-2);
}
#$id div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {
/* fitted */
background-color: var(--sklearn-color-fitted-level-2);
}
#$id div.sk-label label.sk-toggleable__label,
#$id div.sk-label label {
/* The background is the default theme color */
color: var(--sklearn-color-text-on-default-background);
}
/* On hover, darken the color of the background */
#$id div.sk-label:hover label.sk-toggleable__label {
color: var(--sklearn-color-text);
background-color: var(--sklearn-color-unfitted-level-2);
}
/* Label box, darken color on hover, fitted */
#$id div.sk-label.fitted:hover label.sk-toggleable__label.fitted {
color: var(--sklearn-color-text);
background-color: var(--sklearn-color-fitted-level-2);
}
/* Estimator label */
#$id div.sk-label label {
font-family: monospace;
font-weight: bold;
display: inline-block;
line-height: 1.2em;
}
#$id div.sk-label-container {
text-align: center;
}
/* Estimator-specific */
#$id div.sk-estimator {
font-family: monospace;
border: 1px dotted var(--sklearn-color-border-box);
border-radius: 0.25em;
box-sizing: border-box;
margin-bottom: 0.5em;
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-0);
}
#$id div.sk-estimator.fitted {
/* fitted */
background-color: var(--sklearn-color-fitted-level-0);
}
/* on hover */
#$id div.sk-estimator:hover {
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-2);
}
#$id div.sk-estimator.fitted:hover {
/* fitted */
background-color: var(--sklearn-color-fitted-level-2);
}
/* Specification for estimator info (e.g. "i" and "?") */
/* Common style for "i" and "?" */
.sk-estimator-doc-link,
a:link.sk-estimator-doc-link,
a:visited.sk-estimator-doc-link {
float: right;
font-size: smaller;
line-height: 1em;
font-family: monospace;
background-color: var(--sklearn-color-background);
border-radius: 1em;
height: 1em;
width: 1em;
text-decoration: none !important;
margin-left: 1ex;
/* unfitted */
border: var(--sklearn-color-unfitted-level-1) 1pt solid;
color: var(--sklearn-color-unfitted-level-1);
}
.sk-estimator-doc-link.fitted,
a:link.sk-estimator-doc-link.fitted,
a:visited.sk-estimator-doc-link.fitted {
/* fitted */
border: var(--sklearn-color-fitted-level-1) 1pt solid;
color: var(--sklearn-color-fitted-level-1);
}
/* On hover */
div.sk-estimator:hover .sk-estimator-doc-link:hover,
.sk-estimator-doc-link:hover,
div.sk-label-container:hover .sk-estimator-doc-link:hover,
.sk-estimator-doc-link:hover {
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-3);
color: var(--sklearn-color-background);
text-decoration: none;
}
div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,
.sk-estimator-doc-link.fitted:hover,
div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,
.sk-estimator-doc-link.fitted:hover {
/* fitted */
background-color: var(--sklearn-color-fitted-level-3);
color: var(--sklearn-color-background);
text-decoration: none;
}
/* Span, style for the box shown on hovering the info icon */
.sk-estimator-doc-link span {
display: none;
z-index: 9999;
position: relative;
font-weight: normal;
right: .2ex;
padding: .5ex;
margin: .5ex;
width: min-content;
min-width: 20ex;
max-width: 50ex;
color: var(--sklearn-color-text);
box-shadow: 2pt 2pt 4pt #999;
/* unfitted */
background: var(--sklearn-color-unfitted-level-0);
border: .5pt solid var(--sklearn-color-unfitted-level-3);
}
.sk-estimator-doc-link.fitted span {
/* fitted */
background: var(--sklearn-color-fitted-level-0);
border: var(--sklearn-color-fitted-level-3);
}
.sk-estimator-doc-link:hover span {
display: block;
}
/* "?"-specific style due to the `<a>` HTML tag */
#$id a.estimator_doc_link {
float: right;
font-size: 1rem;
line-height: 1em;
font-family: monospace;
background-color: var(--sklearn-color-background);
border-radius: 1rem;
height: 1rem;
width: 1rem;
text-decoration: none;
/* unfitted */
color: var(--sklearn-color-unfitted-level-1);
border: var(--sklearn-color-unfitted-level-1) 1pt solid;
}
#$id a.estimator_doc_link.fitted {
/* fitted */
border: var(--sklearn-color-fitted-level-1) 1pt solid;
color: var(--sklearn-color-fitted-level-1);
}
/* On hover */
#$id a.estimator_doc_link:hover {
/* unfitted */
background-color: var(--sklearn-color-unfitted-level-3);
color: var(--sklearn-color-background);
text-decoration: none;
}
#$id a.estimator_doc_link.fitted:hover {
/* fitted */
background-color: var(--sklearn-color-fitted-level-3);
}
@@ -0,0 +1,496 @@
import html
import itertools
from contextlib import closing
from inspect import isclass
from io import StringIO
from pathlib import Path
from string import Template
from .. import __version__, config_context
from .fixes import parse_version
class _IDCounter:
"""Generate sequential ids with a prefix."""
def __init__(self, prefix):
self.prefix = prefix
self.count = 0
def get_id(self):
self.count += 1
return f"{self.prefix}-{self.count}"
def _get_css_style():
return Path(__file__).with_suffix(".css").read_text(encoding="utf-8")
_CONTAINER_ID_COUNTER = _IDCounter("sk-container-id")
_ESTIMATOR_ID_COUNTER = _IDCounter("sk-estimator-id")
_CSS_STYLE = _get_css_style()
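The `$id` placeholders in the CSS loaded above are filled in with `string.Template`, as done further down in `estimator_html_repr`; a minimal sketch of that substitution:
from string import Template

css = "#$id div.sk-estimator { font-family: monospace; }"
print(Template(css).substitute(id="sk-container-id-1"))
# #sk-container-id-1 div.sk-estimator { font-family: monospace; }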
class _VisualBlock:
"""HTML Representation of Estimator
Parameters
----------
kind : {'serial', 'parallel', 'single'}
kind of HTML block
estimators : list of estimators or `_VisualBlock`s or a single estimator
If kind != 'single', then `estimators` is a list of
estimators.
If kind == 'single', then `estimators` is a single estimator.
names : list of str, default=None
If kind != 'single', then `names` corresponds to estimators.
If kind == 'single', then `names` is a single string corresponding to
the single estimator.
name_details : list of str, str, or None, default=None
If kind != 'single', then `name_details` corresponds to `names`.
If kind == 'single', then `name_details` is a single string
corresponding to the single estimator.
dash_wrapped : bool, default=True
If true, the HTML element is wrapped with a dashed border.
Only active when kind != 'single'.
"""
def __init__(
self, kind, estimators, *, names=None, name_details=None, dash_wrapped=True
):
self.kind = kind
self.estimators = estimators
self.dash_wrapped = dash_wrapped
if self.kind in ("parallel", "serial"):
if names is None:
names = (None,) * len(estimators)
if name_details is None:
name_details = (None,) * len(estimators)
self.names = names
self.name_details = name_details
def _sk_visual_block_(self):
return self
def _write_label_html(
out,
name,
name_details,
outer_class="sk-label-container",
inner_class="sk-label",
checked=False,
doc_link="",
is_fitted_css_class="",
is_fitted_icon="",
):
"""Write labeled html with or without a dropdown with named details.
Parameters
----------
out : file-like object
The file to write the HTML representation to.
name : str
The label for the estimator. It corresponds either to the estimator class name
for a simple estimator or in the case of a `Pipeline` and `ColumnTransformer`,
it corresponds to the name of the step.
name_details : str
The details to show as content in the dropdown part of the toggleable label. It
can contain information such as non-default parameters or column information for
`ColumnTransformer`.
outer_class : {"sk-label-container", "sk-item"}, default="sk-label-container"
The CSS class for the outer container.
inner_class : {"sk-label", "sk-estimator"}, default="sk-label"
The CSS class for the inner container.
checked : bool, default=False
Whether the dropdown is folded or not. With a single estimator, we intend to
unfold the content.
doc_link : str, default=""
The link to the documentation for the estimator. If an empty string, no link is
added to the diagram. This can be generated for an estimator if it uses the
`_HTMLDocumentationLinkMixin`.
is_fitted_css_class : {"", "fitted"}
The CSS class to indicate whether or not the estimator is fitted. The
empty string means that the estimator is not fitted and "fitted" means that the
estimator is fitted.
is_fitted_icon : str, default=""
The HTML representation to show the fitted information in the diagram. An empty
string means that no information is shown.
"""
# we need to add some padding to the left of the label to be sure it is centered
padding_label = "&nbsp;" if is_fitted_icon else "" # add padding for the "i" char
out.write(
f'<div class="{outer_class}"><div'
f' class="{inner_class} {is_fitted_css_class} sk-toggleable">'
)
name = html.escape(name)
if name_details is not None:
name_details = html.escape(str(name_details))
label_class = (
f"sk-toggleable__label {is_fitted_css_class} sk-toggleable__label-arrow"
)
checked_str = "checked" if checked else ""
est_id = _ESTIMATOR_ID_COUNTER.get_id()
if doc_link:
doc_label = "<span>Online documentation</span>"
if name is not None:
doc_label = f"<span>Documentation for {name}</span>"
doc_link = (
f'<a class="sk-estimator-doc-link {is_fitted_css_class}"'
f' rel="noreferrer" target="_blank" href="{doc_link}">?{doc_label}</a>'
)
padding_label += "&nbsp;" # add additional padding for the "?" char
fmt_str = (
'<input class="sk-toggleable__control sk-hidden--visually"'
f' id="{est_id}" '
f'type="checkbox" {checked_str}><label for="{est_id}" '
f'class="{label_class} {is_fitted_css_class}">{padding_label}{name}'
f"{doc_link}{is_fitted_icon}</label><div "
f'class="sk-toggleable__content {is_fitted_css_class}">'
f"<pre>{name_details}</pre></div> "
)
out.write(fmt_str)
else:
out.write(f"<label>{name}</label>")
out.write("</div></div>") # outer_class inner_class
def _get_visual_block(estimator):
"""Generate information about how to display an estimator."""
if hasattr(estimator, "_sk_visual_block_"):
try:
return estimator._sk_visual_block_()
except Exception:
return _VisualBlock(
"single",
estimator,
names=estimator.__class__.__name__,
name_details=str(estimator),
)
if isinstance(estimator, str):
return _VisualBlock(
"single", estimator, names=estimator, name_details=estimator
)
elif estimator is None:
return _VisualBlock("single", estimator, names="None", name_details="None")
# check if estimator looks like a meta estimator (wraps estimators)
if hasattr(estimator, "get_params") and not isclass(estimator):
estimators = [
(key, est)
for key, est in estimator.get_params(deep=False).items()
if hasattr(est, "get_params") and hasattr(est, "fit") and not isclass(est)
]
if estimators:
return _VisualBlock(
"parallel",
[est for _, est in estimators],
names=[f"{key}: {est.__class__.__name__}" for key, est in estimators],
name_details=[str(est) for _, est in estimators],
)
return _VisualBlock(
"single",
estimator,
names=estimator.__class__.__name__,
name_details=str(estimator),
)
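As an illustration of the dispatch above (assuming scikit-learn is importable): a bare estimator without `_sk_visual_block_` falls through to a 'single' block, while composite estimators such as `Pipeline` return their own block kind:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

print(_get_visual_block(LogisticRegression()).kind)  # 'single'
print(_get_visual_block(make_pipeline(StandardScaler(), LogisticRegression())).kind)  # 'serial'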
def _write_estimator_html(
out,
estimator,
estimator_label,
estimator_label_details,
is_fitted_css_class,
is_fitted_icon="",
first_call=False,
):
"""Write estimator to html in serial, parallel, or by itself (single).
For multiple estimators, this function is called recursively.
Parameters
----------
out : file-like object
The file to write the HTML representation to.
estimator : estimator object
The estimator to visualize.
estimator_label : str
The label for the estimator. It corresponds either to the estimator class name
for simple estimator or in the case of `Pipeline` and `ColumnTransformer`, it
corresponds to the name of the step.
estimator_label_details : str
The details to show as content in the dropdown part of the toggleable label.
It can contain information such as non-default parameters or column information for
`ColumnTransformer`.
is_fitted_css_class : {"", "fitted"}
The CSS class to indicate whether or not the estimator is fitted. The
empty string means that the estimator is not fitted and "fitted" means that the
estimator is fitted.
is_fitted_icon : str, default=""
The HTML representation to show the fitted information in the diagram. An empty
string means that no information is shown. If the estimator to be shown is not
the first estimator (i.e. `first_call=False`), `is_fitted_icon` is always an
empty string.
first_call : bool, default=False
Whether this is the first time this function is called.
"""
if first_call:
est_block = _get_visual_block(estimator)
else:
is_fitted_icon = ""
with config_context(print_changed_only=True):
est_block = _get_visual_block(estimator)
# `estimator` can also be an instance of `_VisualBlock`
if hasattr(estimator, "_get_doc_link"):
doc_link = estimator._get_doc_link()
else:
doc_link = ""
if est_block.kind in ("serial", "parallel"):
dashed_wrapped = first_call or est_block.dash_wrapped
dash_cls = " sk-dashed-wrapped" if dashed_wrapped else ""
out.write(f'<div class="sk-item{dash_cls}">')
if estimator_label:
_write_label_html(
out,
estimator_label,
estimator_label_details,
doc_link=doc_link,
is_fitted_css_class=is_fitted_css_class,
is_fitted_icon=is_fitted_icon,
)
kind = est_block.kind
out.write(f'<div class="sk-{kind}">')
est_infos = zip(est_block.estimators, est_block.names, est_block.name_details)
for est, name, name_details in est_infos:
if kind == "serial":
_write_estimator_html(
out,
est,
name,
name_details,
is_fitted_css_class=is_fitted_css_class,
)
else: # parallel
out.write('<div class="sk-parallel-item">')
# wrap element in a serial visualblock
serial_block = _VisualBlock("serial", [est], dash_wrapped=False)
_write_estimator_html(
out,
serial_block,
name,
name_details,
is_fitted_css_class=is_fitted_css_class,
)
out.write("</div>") # sk-parallel-item
out.write("</div></div>")
elif est_block.kind == "single":
_write_label_html(
out,
est_block.names,
est_block.name_details,
outer_class="sk-item",
inner_class="sk-estimator",
checked=first_call,
doc_link=doc_link,
is_fitted_css_class=is_fitted_css_class,
is_fitted_icon=is_fitted_icon,
)
def estimator_html_repr(estimator):
"""Build a HTML representation of an estimator.
Read more in the :ref:`User Guide <visualizing_composite_estimators>`.
Parameters
----------
estimator : estimator object
The estimator to visualize.
Returns
-------
html: str
HTML representation of estimator.
Examples
--------
>>> from sklearn.utils._estimator_html_repr import estimator_html_repr
>>> from sklearn.linear_model import LogisticRegression
>>> estimator_html_repr(LogisticRegression())
'<style>...</div>'
"""
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted
if not hasattr(estimator, "fit"):
status_label = "<span>Not fitted</span>"
is_fitted_css_class = ""
else:
try:
check_is_fitted(estimator)
status_label = "<span>Fitted</span>"
is_fitted_css_class = "fitted"
except NotFittedError:
status_label = "<span>Not fitted</span>"
is_fitted_css_class = ""
is_fitted_icon = (
f'<span class="sk-estimator-doc-link {is_fitted_css_class}">'
f"i{status_label}</span>"
)
with closing(StringIO()) as out:
container_id = _CONTAINER_ID_COUNTER.get_id()
style_template = Template(_CSS_STYLE)
style_with_id = style_template.substitute(id=container_id)
estimator_str = str(estimator)
# The fallback message is shown by default and loading the CSS sets
# div.sk-text-repr-fallback to display: none to hide the fallback message.
#
# If the notebook is trusted, the CSS is loaded which hides the fallback
# message. If the notebook is not trusted, then the CSS is not loaded and the
# fallback message is shown by default.
#
# The reverse logic applies to HTML repr div.sk-container.
# div.sk-container is hidden by default and loading the CSS displays it.
fallback_msg = (
"In a Jupyter environment, please rerun this cell to show the HTML"
" representation or trust the notebook. <br />On GitHub, the"
" HTML representation is unable to render, please try loading this page"
" with nbviewer.org."
)
html_template = (
f"<style>{style_with_id}</style>"
f'<div id="{container_id}" class="sk-top-container">'
'<div class="sk-text-repr-fallback">'
f"<pre>{html.escape(estimator_str)}</pre><b>{fallback_msg}</b>"
"</div>"
'<div class="sk-container" hidden>'
)
out.write(html_template)
_write_estimator_html(
out,
estimator,
estimator.__class__.__name__,
estimator_str,
first_call=True,
is_fitted_css_class=is_fitted_css_class,
is_fitted_icon=is_fitted_icon,
)
out.write("</div></div>")
html_output = out.getvalue()
return html_output
class _HTMLDocumentationLinkMixin:
"""Mixin class allowing to generate a link to the API documentation.
This mixin relies on three attributes:
- `_doc_link_module`: it corresponds to the root module (e.g. `sklearn`). Using this
mixin, the default value is `sklearn`.
- `_doc_link_template`: it corresponds to the template used to generate the
link to the API documentation. Using this mixin, the default value is
`"https://scikit-learn.org/{version_url}/modules/generated/
{estimator_module}.{estimator_name}.html"`.
- `_doc_link_url_param_generator`: it corresponds to a function that generates the
parameters to be used in the template when the estimator module and name are not
sufficient.
The method :meth:`_get_doc_link` generates the link to the API documentation for a
given estimator.
This mixin provides all the necessary state for
:func:`sklearn.utils.estimator_html_repr` to generate a link to the API
documentation for the estimator HTML diagram.
Examples
--------
If the default values for `_doc_link_module`, `_doc_link_template` are not suitable,
then you can override them:
>>> from sklearn.base import BaseEstimator
>>> estimator = BaseEstimator()
>>> estimator._doc_link_template = "https://website.com/{single_param}.html"
>>> def url_param_generator(estimator):
... return {"single_param": estimator.__class__.__name__}
>>> estimator._doc_link_url_param_generator = url_param_generator
>>> estimator._get_doc_link()
'https://website.com/BaseEstimator.html'
"""
_doc_link_module = "sklearn"
_doc_link_url_param_generator = None
@property
def _doc_link_template(self):
sklearn_version = parse_version(__version__)
if sklearn_version.dev is None:
version_url = f"{sklearn_version.major}.{sklearn_version.minor}"
else:
version_url = "dev"
return getattr(
self,
"__doc_link_template",
(
f"https://scikit-learn.org/{version_url}/modules/generated/"
"{estimator_module}.{estimator_name}.html"
),
)
@_doc_link_template.setter
def _doc_link_template(self, value):
setattr(self, "__doc_link_template", value)
def _get_doc_link(self):
"""Generates a link to the API documentation for a given estimator.
This method generates the link to the estimator's documentation page
by using the template defined by the attribute `_doc_link_template`.
Returns
-------
url : str
The URL to the API documentation for this estimator. If the estimator does
not belong to module `_doc_link_module`, the empty string (i.e. `""`) is
returned.
"""
if self.__class__.__module__.split(".")[0] != self._doc_link_module:
return ""
if self._doc_link_url_param_generator is None:
estimator_name = self.__class__.__name__
# Construct the estimator's module name, up to the first private submodule.
# This works because in scikit-learn all public estimators are exposed at
# that level, even if they actually live in a private sub-module.
estimator_module = ".".join(
itertools.takewhile(
lambda part: not part.startswith("_"),
self.__class__.__module__.split("."),
)
)
return self._doc_link_template.format(
estimator_module=estimator_module, estimator_name=estimator_name
)
return self._doc_link_template.format(
**self._doc_link_url_param_generator(self)
)
@@ -0,0 +1,18 @@
# Author: Gael Varoquaux
# License: BSD
"""
Uses C++ map containers for fast dict-like behavior with integer keys
and float values.
"""
from libcpp.map cimport map as cpp_map
from ._typedefs cimport float64_t, intp_t
###############################################################################
# An object to be used in Python
cdef class IntFloatDict:
cdef cpp_map[intp_t, float64_t] my_map
cdef _to_arrays(self, intp_t [:] keys, float64_t [:] values)
@@ -0,0 +1,137 @@
"""
Uses C++ map containers for fast dict-like behavior with integer keys
and float values.
"""
# Author: Gael Varoquaux
# License: BSD
# C++
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.utility cimport pair
from libcpp.map cimport map as cpp_map
import numpy as np
from ._typedefs cimport float64_t, intp_t
###############################################################################
# An object to be used in Python
# Lookup is faster than dict (up to 10 times), and so is full traversal
# (up to 50 times), and assignment (up to 6 times), but creation is
# slower (up to 3 times). Also, a large benefit is that memory
# consumption is reduced a lot compared to a Python dict
cdef class IntFloatDict:
def __init__(
self,
intp_t[:] keys,
float64_t[:] values,
):
cdef int i
cdef int size = values.size
# TODO: check that `keys` and `values` have the same size, then
# disable bounds checking with cython.boundscheck(False)
for i in range(size):
self.my_map[keys[i]] = values[i]
def __len__(self):
return self.my_map.size()
def __getitem__(self, int key):
cdef cpp_map[intp_t, float64_t].iterator it = self.my_map.find(key)
if it == self.my_map.end():
# The key is not in the dict
raise KeyError('%i' % key)
return deref(it).second
def __setitem__(self, int key, float value):
self.my_map[key] = value
# Cython 0.20 generates buggy code below. Commenting this out for now
# and relying on the to_arrays method
# def __iter__(self):
# cdef cpp_map[intp_t, float64_t].iterator it = self.my_map.begin()
# cdef cpp_map[intp_t, float64_t].iterator end = self.my_map.end()
# while it != end:
# yield deref(it).first, deref(it).second
# inc(it)
def __iter__(self):
cdef int size = self.my_map.size()
cdef intp_t [:] keys = np.empty(size, dtype=np.intp)
cdef float64_t [:] values = np.empty(size, dtype=np.float64)
self._to_arrays(keys, values)
cdef int idx
cdef intp_t key
cdef float64_t value
for idx in range(size):
key = keys[idx]
value = values[idx]
yield key, value
def to_arrays(self):
"""Return the key, value representation of the IntFloatDict
object.
Returns
-------
keys : ndarray, shape (n_items, ), dtype=int
The indices of the data points
values : ndarray, shape (n_items, ), dtype=float
The values of the data points
"""
cdef int size = self.my_map.size()
keys = np.empty(size, dtype=np.intp)
values = np.empty(size, dtype=np.float64)
self._to_arrays(keys, values)
return keys, values
cdef _to_arrays(self, intp_t [:] keys, float64_t [:] values):
# Internal version of to_arrays that takes already-initialized arrays
cdef cpp_map[intp_t, float64_t].iterator it = self.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator end = self.my_map.end()
cdef int index = 0
while it != end:
keys[index] = deref(it).first
values[index] = deref(it).second
inc(it)
index += 1
def update(self, IntFloatDict other):
cdef cpp_map[intp_t, float64_t].iterator it = other.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator end = other.my_map.end()
while it != end:
self.my_map[deref(it).first] = deref(it).second
inc(it)
def copy(self):
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
# The '=' operator is a copy operator for C++ maps
out_obj.my_map = self.my_map
return out_obj
def append(self, intp_t key, float64_t value):
# Construct our arguments
cdef pair[intp_t, float64_t] args
args.first = key
args.second = value
self.my_map.insert(args)
###############################################################################
# operation on dict
def argmin(IntFloatDict d):
cdef cpp_map[intp_t, float64_t].iterator it = d.my_map.begin()
cdef cpp_map[intp_t, float64_t].iterator end = d.my_map.end()
cdef intp_t min_key = -1
cdef float64_t min_value = np.inf
while it != end:
if deref(it).second < min_value:
min_value = deref(it).second
min_key = deref(it).first
inc(it)
return min_key, min_value
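Illustrative usage from Python, assuming the compiled extension is importable as `sklearn.utils._fast_dict` (a sketch, not part of the file):
import numpy as np
from sklearn.utils._fast_dict import IntFloatDict, argmin

d = IntFloatDict(np.array([1, 5], dtype=np.intp),
                 np.array([0.5, 2.0], dtype=np.float64))
d.append(7, 0.25)
print(len(d), d[5])    # 3 2.0
print(argmin(d))       # (7, 0.25)
keys, values = d.to_arrays()   # keys sorted ascending: [1 5 7]; values: [0.5 2. 0.25]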
@@ -0,0 +1,14 @@
# Heap routines, used in various Cython implementations.
from cython cimport floating
from ._typedefs cimport intp_t
cdef int heap_push(
floating* values,
intp_t* indices,
intp_t size,
floating val,
intp_t val_idx,
) noexcept nogil
@@ -0,0 +1,85 @@
from cython cimport floating
from ._typedefs cimport intp_t
cdef inline int heap_push(
floating* values,
intp_t* indices,
intp_t size,
floating val,
intp_t val_idx,
) noexcept nogil:
"""Push a tuple (val, val_idx) onto a fixed-size max-heap.
The max-heap is represented as a Structure of Arrays where:
- values is the array containing the data to construct the heap with
- indices is the array containing the indices (meta-data) of each value
Notes
-----
Arrays are manipulated via a pointer to their first element and their size
so as to ease the processing of dynamically allocated buffers.
For instance, in pseudo-code:
values = [1.2, 0.4, 0.1],
indices = [42, 1, 5],
heap_push(
values=values,
indices=indices,
size=3,
val=0.2,
val_idx=4,
)
will modify values and indices inplace, giving at the end of the call:
values == [0.4, 0.2, 0.1]
indices == [1, 4, 5]
"""
cdef:
intp_t current_idx, left_child_idx, right_child_idx, swap_idx
# Check if val should be in heap
if val >= values[0]:
return 0
# Insert val at position zero
values[0] = val
indices[0] = val_idx
# Descend the heap, swapping values until the max heap criterion is met
current_idx = 0
while True:
left_child_idx = 2 * current_idx + 1
right_child_idx = left_child_idx + 1
if left_child_idx >= size:
break
elif right_child_idx >= size:
if values[left_child_idx] > val:
swap_idx = left_child_idx
else:
break
elif values[left_child_idx] >= values[right_child_idx]:
if val < values[left_child_idx]:
swap_idx = left_child_idx
else:
break
else:
if val < values[right_child_idx]:
swap_idx = right_child_idx
else:
break
values[current_idx] = values[swap_idx]
indices[current_idx] = indices[swap_idx]
current_idx = swap_idx
values[current_idx] = val
indices[current_idx] = val_idx
return 0
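For readers less familiar with Cython, here is a pure-Python mirror of the same fixed-size max-heap push (an illustrative sketch, not part of the file):
def heap_push_py(values, indices, val, val_idx):
    # Push (val, val_idx) onto the fixed-size max-heap stored in `values`/`indices`.
    size = len(values)
    if val >= values[0]:          # not among the `size` smallest values
        return
    values[0], indices[0] = val, val_idx
    i = 0
    while True:
        left, right = 2 * i + 1, 2 * i + 2
        if left >= size:
            break
        elif right >= size:
            swap = left if values[left] > val else None
        elif values[left] >= values[right]:
            swap = left if val < values[left] else None
        else:
            swap = right if val < values[right] else None
        if swap is None:
            break
        values[i], indices[i] = values[swap], indices[swap]
        i = swap
    values[i], indices[i] = val, val_idx

vals, idxs = [1.2, 0.4, 0.1], [42, 1, 5]
heap_push_py(vals, idxs, 0.2, 4)
print(vals, idxs)   # [0.4, 0.2, 0.1] [1, 4, 5]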
@@ -0,0 +1,635 @@
import numbers
import sys
import warnings
from collections import UserList
from itertools import compress, islice
import numpy as np
from scipy.sparse import issparse
from ._array_api import _is_numpy_namespace, get_namespace
from ._param_validation import Interval, validate_params
from .extmath import _approximate_mode
from .validation import (
_is_arraylike_not_scalar,
_is_pandas_df,
_is_polars_df_or_series,
_use_interchange_protocol,
check_array,
check_consistent_length,
check_random_state,
)
def _array_indexing(array, key, key_dtype, axis):
"""Index an array or scipy.sparse consistently across NumPy version."""
xp, is_array_api = get_namespace(array)
if is_array_api:
return xp.take(array, key, axis=axis)
if issparse(array) and key_dtype == "bool":
key = np.asarray(key)
if isinstance(key, tuple):
key = list(key)
return array[key, ...] if axis == 0 else array[:, key]
def _pandas_indexing(X, key, key_dtype, axis):
"""Index a pandas dataframe or a series."""
if _is_arraylike_not_scalar(key):
key = np.asarray(key)
if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
# using take() instead of iloc[] ensures the return value is a "proper"
# copy that will not raise SettingWithCopyWarning
return X.take(key, axis=axis)
else:
# check whether we should index with loc or iloc
indexer = X.iloc if key_dtype == "int" else X.loc
return indexer[:, key] if axis else indexer[key]
def _list_indexing(X, key, key_dtype):
"""Index a Python list."""
if np.isscalar(key) or isinstance(key, slice):
# key is a slice or a scalar
return X[key]
if key_dtype == "bool":
# key is a boolean array-like
return list(compress(X, key))
# key is an integer array-like
return [X[idx] for idx in key]
def _polars_indexing(X, key, key_dtype, axis):
"""Indexing X with polars interchange protocol."""
# Polars behavior is more consistent with lists
if isinstance(key, np.ndarray):
# Convert each element of the array to a Python scalar
key = key.tolist()
elif not (np.isscalar(key) or isinstance(key, slice)):
key = list(key)
if axis == 1:
# Here we are certain to have a polars DataFrame, which can be indexed with
# integer and string scalars, and lists of integers, strings and booleans
return X[:, key]
if key_dtype == "bool":
# Boolean mask can be indexed in the same way for Series and DataFrame (axis=0)
return X.filter(key)
# Integer scalar and list of integer can be indexed in the same way for Series and
# DataFrame (axis=0)
X_indexed = X[key]
if np.isscalar(key) and len(X.shape) == 2:
# `X_indexed` is a DataFrame with a single row; we return a Series to be
# consistent with pandas
pl = sys.modules["polars"]
return pl.Series(X_indexed.row(0))
return X_indexed
def _determine_key_type(key, accept_slice=True):
"""Determine the data type of key.
Parameters
----------
key : scalar, slice or array-like
The key from which we want to infer the data type.
accept_slice : bool, default=True
Whether or not to raise an error if the key is a slice.
Returns
-------
dtype : {'int', 'str', 'bool', None}
Returns the data type of key.
"""
err_msg = (
"No valid specification of the columns. Only a scalar, list or "
"slice of all integers or all strings, or boolean mask is "
"allowed"
)
dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
array_dtype_to_str = {
"i": "int",
"u": "int",
"b": "bool",
"O": "str",
"U": "str",
"S": "str",
}
if key is None:
return None
if isinstance(key, tuple(dtype_to_str.keys())):
try:
return dtype_to_str[type(key)]
except KeyError:
raise ValueError(err_msg)
if isinstance(key, slice):
if not accept_slice:
raise TypeError(
"Only array-like or scalar are supported. A Python slice was given."
)
if key.start is None and key.stop is None:
return None
key_start_type = _determine_key_type(key.start)
key_stop_type = _determine_key_type(key.stop)
if key_start_type is not None and key_stop_type is not None:
if key_start_type != key_stop_type:
raise ValueError(err_msg)
if key_start_type is not None:
return key_start_type
return key_stop_type
# TODO(1.9) remove UserList when the force_int_remainder_cols param
# of ColumnTransformer is removed
if isinstance(key, (list, tuple, UserList)):
unique_key = set(key)
key_type = {_determine_key_type(elt) for elt in unique_key}
if not key_type:
return None
if len(key_type) != 1:
raise ValueError(err_msg)
return key_type.pop()
if hasattr(key, "dtype"):
xp, is_array_api = get_namespace(key)
# NumPy arrays are special-cased in their own branch because the Array API
# cannot handle object/string-based dtypes that are often used to index
# columns of dataframes by names.
if is_array_api and not _is_numpy_namespace(xp):
if xp.isdtype(key.dtype, "bool"):
return "bool"
elif xp.isdtype(key.dtype, "integral"):
return "int"
else:
raise ValueError(err_msg)
else:
try:
return array_dtype_to_str[key.dtype.kind]
except KeyError:
raise ValueError(err_msg)
raise ValueError(err_msg)
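A few illustrative calls, with the expected return values inferred from the branches above:
import numpy as np

print(_determine_key_type(3))                        # 'int'
print(_determine_key_type(["a", "b"]))               # 'str'
print(_determine_key_type(slice(0, 5)))              # 'int'
print(_determine_key_type(np.array([True, False])))  # 'bool'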
def _safe_indexing(X, indices, *, axis=0):
"""Return rows, items or columns of X using indices.
.. warning::
This utility is documented, but **private**. This means that
backward compatibility might be broken without any deprecation
cycle.
Parameters
----------
X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
Data from which to sample rows, items or columns. `list` are only
supported when `axis=0`.
indices : bool, int, str, slice, array-like
- If `axis=0`, boolean and integer array-like, integer slice,
and scalar integer are supported.
- If `axis=1`:
- to select a single column, `indices` can be of `int` type for
all `X` types and `str` only for dataframe. The selected subset
will be 1D, unless `X` is a sparse matrix in which case it will
be 2D.
- to select multiple columns, `indices` can be one of the
following: `list`, `array`, `slice`. The type used in
these containers can be one of the following: `int`, 'bool' and
`str`. However, `str` is only supported when `X` is a dataframe.
The selected subset will be 2D.
axis : int, default=0
The axis along which `X` will be subsampled. `axis=0` will select
rows while `axis=1` will select columns.
Returns
-------
subset
Subset of X on axis 0 or 1.
Notes
-----
CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
not supported.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils import _safe_indexing
>>> data = np.array([[1, 2], [3, 4], [5, 6]])
>>> _safe_indexing(data, 0, axis=0) # select the first row
array([1, 2])
>>> _safe_indexing(data, 0, axis=1) # select the first column
array([1, 3, 5])
"""
if indices is None:
return X
if axis not in (0, 1):
raise ValueError(
"'axis' should be either 0 (to index rows) or 1 (to index "
" column). Got {} instead.".format(axis)
)
indices_dtype = _determine_key_type(indices)
if axis == 0 and indices_dtype == "str":
raise ValueError("String indexing is not supported with 'axis=0'")
if axis == 1 and isinstance(X, list):
raise ValueError("axis=1 is not supported for lists")
if axis == 1 and hasattr(X, "shape") and len(X.shape) != 2:
raise ValueError(
"'X' should be a 2D NumPy array, 2D sparse matrix or "
"dataframe when indexing the columns (i.e. 'axis=1'). "
"Got {} instead with {} dimension(s).".format(type(X), len(X.shape))
)
if (
axis == 1
and indices_dtype == "str"
and not (_is_pandas_df(X) or _use_interchange_protocol(X))
):
raise ValueError(
"Specifying the columns using strings is only supported for dataframes."
)
if hasattr(X, "iloc"):
# TODO: we should probably use _is_pandas_df_or_series(X) instead but this
# would require updating some tests such as test_train_test_split_mock_pandas.
return _pandas_indexing(X, indices, indices_dtype, axis=axis)
elif _is_polars_df_or_series(X):
return _polars_indexing(X, indices, indices_dtype, axis=axis)
elif hasattr(X, "shape"):
return _array_indexing(X, indices, indices_dtype, axis=axis)
else:
return _list_indexing(X, indices, indices_dtype)
def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
"""Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
Parameters
----------
X : {ndarray, sparse-matrix, dataframe}
Array to be modified. It is expected to be 2-dimensional.
values : ndarray
The values to be assigned to `X`.
row_indexer : array-like, dtype={int, bool}, default=None
A 1-dimensional array to select the rows of interest. If `None`, all
rows are selected.
column_indexer : array-like, dtype={int, bool}, default=None
A 1-dimensional array to select the columns of interest. If `None`, all
columns are selected.
"""
row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
column_indexer = (
slice(None, None, None) if column_indexer is None else column_indexer
)
if hasattr(X, "iloc"): # pandas dataframe
with warnings.catch_warnings():
# pandas >= 1.5 raises a warning when using iloc to set values in a column
# that does not have the same type as the column being set. It happens
# for instance when setting a categorical column with a string.
# In the future the behavior won't change and the warning should disappear.
# TODO(1.3): check if the warning is still raised or remove the filter.
warnings.simplefilter("ignore", FutureWarning)
X.iloc[row_indexer, column_indexer] = values
else: # numpy array or sparse matrix
X[row_indexer, column_indexer] = values
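A minimal usage sketch for the NumPy branch above (pandas and sparse inputs follow the same calling convention):
import numpy as np

X = np.zeros((3, 2))
_safe_assign(X, np.array([[1.0, 2.0]]), row_indexer=[1])
# X is now:
# [[0. 0.]
#  [1. 2.]
#  [0. 0.]]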
def _get_column_indices_for_bool_or_int(key, n_columns):
# Convert key into list of positive integer indexes
try:
idx = _safe_indexing(np.arange(n_columns), key)
except IndexError as e:
raise ValueError(
f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]"
) from e
return np.atleast_1d(idx).tolist()
def _get_column_indices(X, key):
"""Get feature column indices for input data X and key.
For accepted values of `key`, see the docstring of
:func:`_safe_indexing`.
"""
key_dtype = _determine_key_type(key)
if _use_interchange_protocol(X):
return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype)
n_columns = X.shape[1]
if isinstance(key, (list, tuple)) and not key:
# we get an empty list
return []
elif key_dtype in ("bool", "int"):
return _get_column_indices_for_bool_or_int(key, n_columns)
else:
try:
all_columns = X.columns
except AttributeError:
raise ValueError(
"Specifying the columns using strings is only supported for dataframes."
)
if isinstance(key, str):
columns = [key]
elif isinstance(key, slice):
start, stop = key.start, key.stop
if start is not None:
start = all_columns.get_loc(start)
if stop is not None:
# pandas indexing with strings is endpoint included
stop = all_columns.get_loc(stop) + 1
else:
stop = n_columns + 1
return list(islice(range(n_columns), start, stop))
else:
columns = list(key)
try:
column_indices = []
for col in columns:
col_idx = all_columns.get_loc(col)
if not isinstance(col_idx, numbers.Integral):
raise ValueError(
f"Selected columns, {columns}, are not unique in dataframe"
)
column_indices.append(col_idx)
except KeyError as e:
raise ValueError("A given column is not a column of the dataframe") from e
return column_indices
def _get_column_indices_interchange(X_interchange, key, key_dtype):
"""Same as _get_column_indices but for X with __dataframe__ protocol."""
n_columns = X_interchange.num_columns()
if isinstance(key, (list, tuple)) and not key:
# we get an empty list
return []
elif key_dtype in ("bool", "int"):
return _get_column_indices_for_bool_or_int(key, n_columns)
else:
column_names = list(X_interchange.column_names())
if isinstance(key, slice):
if key.step not in [1, None]:
raise NotImplementedError("key.step must be 1 or None")
start, stop = key.start, key.stop
if start is not None:
start = column_names.index(start)
if stop is not None:
stop = column_names.index(stop) + 1
else:
stop = n_columns + 1
return list(islice(range(n_columns), start, stop))
selected_columns = [key] if np.isscalar(key) else key
try:
return [column_names.index(col) for col in selected_columns]
except ValueError as e:
raise ValueError("A given column is not a column of the dataframe") from e
@validate_params(
{
"replace": ["boolean"],
"n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
"stratify": ["array-like", "sparse matrix", None],
},
prefer_skip_nested_validation=True,
)
def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
"""Resample arrays or sparse matrices in a consistent way.
The default strategy implements one step of the bootstrapping
procedure.
Parameters
----------
*arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Indexable data-structures can be arrays, lists, dataframes or scipy
sparse matrices with consistent first dimension.
replace : bool, default=True
Implements resampling with replacement. If False, this will implement
(sliced) random permutations.
n_samples : int, default=None
Number of samples to generate. If left to None this is
automatically set to the first dimension of the arrays.
If replace is False it should not be larger than the length of
arrays.
random_state : int, RandomState instance or None, default=None
Determines random number generation for shuffling
the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
stratify : {array-like, sparse matrix} of shape (n_samples,) or \
(n_samples, n_outputs), default=None
If not None, data is split in a stratified fashion, using this as
the class labels.
Returns
-------
resampled_arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Sequence of resampled copies of the collections. The original arrays
are not impacted.
See Also
--------
shuffle : Shuffle arrays or sparse matrices in a consistent way.
Examples
--------
It is possible to mix sparse and dense arrays in the same run::
>>> import numpy as np
>>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
>>> y = np.array([0, 1, 2])
>>> from scipy.sparse import coo_matrix
>>> X_sparse = coo_matrix(X)
>>> from sklearn.utils import resample
>>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
>>> X
array([[1., 0.],
[2., 1.],
[1., 0.]])
>>> X_sparse
<3x2 sparse matrix of type '<... 'numpy.float64'>'
with 4 stored elements in Compressed Sparse Row format>
>>> X_sparse.toarray()
array([[1., 0.],
[2., 1.],
[1., 0.]])
>>> y
array([0, 1, 0])
>>> resample(y, n_samples=2, random_state=0)
array([0, 1])
Example using stratification::
>>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
>>> resample(y, n_samples=5, replace=False, stratify=y,
... random_state=0)
[1, 1, 1, 0, 1]
"""
max_n_samples = n_samples
random_state = check_random_state(random_state)
if len(arrays) == 0:
return None
first = arrays[0]
n_samples = first.shape[0] if hasattr(first, "shape") else len(first)
if max_n_samples is None:
max_n_samples = n_samples
elif (max_n_samples > n_samples) and (not replace):
raise ValueError(
"Cannot sample %d out of arrays with dim %d when replace is False"
% (max_n_samples, n_samples)
)
check_consistent_length(*arrays)
if stratify is None:
if replace:
indices = random_state.randint(0, n_samples, size=(max_n_samples,))
else:
indices = np.arange(n_samples)
random_state.shuffle(indices)
indices = indices[:max_n_samples]
else:
# Code adapted from StratifiedShuffleSplit()
y = check_array(stratify, ensure_2d=False, dtype=None)
if y.ndim == 2:
# for multi-label y, map each distinct row to a string repr
# using join because str(row) uses an ellipsis if len(row) > 1000
y = np.array([" ".join(row.astype("str")) for row in y])
classes, y_indices = np.unique(y, return_inverse=True)
n_classes = classes.shape[0]
class_counts = np.bincount(y_indices)
# Find the sorted list of instances for each class:
# (np.unique above performs a sort, so code is O(n logn) already)
class_indices = np.split(
np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
)
n_i = _approximate_mode(class_counts, max_n_samples, random_state)
indices = []
for i in range(n_classes):
indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
indices.extend(indices_i)
indices = random_state.permutation(indices)
# convert sparse matrices to CSR for row-based indexing
arrays = [a.tocsr() if issparse(a) else a for a in arrays]
resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
if len(resampled_arrays) == 1:
# syntactic sugar for the unit argument case
return resampled_arrays[0]
else:
return resampled_arrays
def shuffle(*arrays, random_state=None, n_samples=None):
"""Shuffle arrays or sparse matrices in a consistent way.
This is a convenience alias to ``resample(*arrays, replace=False)`` to do
random permutations of the collections.
Parameters
----------
*arrays : sequence of indexable data-structures
Indexable data-structures can be arrays, lists, dataframes or scipy
sparse matrices with consistent first dimension.
random_state : int, RandomState instance or None, default=None
Determines random number generation for shuffling
the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
n_samples : int, default=None
Number of samples to generate. If left to None this is
automatically set to the first dimension of the arrays. It should
not be larger than the length of arrays.
Returns
-------
shuffled_arrays : sequence of indexable data-structures
Sequence of shuffled copies of the collections. The original arrays
are not impacted.
See Also
--------
resample : Resample arrays or sparse matrices in a consistent way.
Examples
--------
It is possible to mix sparse and dense arrays in the same run::
>>> import numpy as np
>>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
>>> y = np.array([0, 1, 2])
>>> from scipy.sparse import coo_matrix
>>> X_sparse = coo_matrix(X)
>>> from sklearn.utils import shuffle
>>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
>>> X
array([[0., 0.],
[2., 1.],
[1., 0.]])
>>> X_sparse
<3x2 sparse matrix of type '<... 'numpy.float64'>'
with 3 stored elements in Compressed Sparse Row format>
>>> X_sparse.toarray()
array([[0., 0.],
[2., 1.],
[1., 0.]])
>>> y
array([2, 1, 0])
>>> shuffle(y, n_samples=2, random_state=0)
array([0, 1])
"""
return resample(
*arrays, replace=False, n_samples=n_samples, random_state=random_state
)
@@ -0,0 +1,50 @@
# Author: John Kirkham, Meekail Zain, Thomas Fan
from libc.math cimport isnan, isinf
from cython cimport floating
cpdef enum FiniteStatus:
all_finite = 0
has_nan = 1
has_infinite = 2
def cy_isfinite(floating[::1] a, bint allow_nan=False):
cdef FiniteStatus result
with nogil:
result = _isfinite(a, allow_nan)
return result
cdef inline FiniteStatus _isfinite(floating[::1] a, bint allow_nan) noexcept nogil:
cdef floating* a_ptr = &a[0]
cdef Py_ssize_t length = len(a)
if allow_nan:
return _isfinite_allow_nan(a_ptr, length)
else:
return _isfinite_disable_nan(a_ptr, length)
cdef inline FiniteStatus _isfinite_allow_nan(floating* a_ptr,
Py_ssize_t length) noexcept nogil:
cdef Py_ssize_t i
cdef floating v
for i in range(length):
v = a_ptr[i]
if isinf(v):
return FiniteStatus.has_infinite
return FiniteStatus.all_finite
cdef inline FiniteStatus _isfinite_disable_nan(floating* a_ptr,
Py_ssize_t length) noexcept nogil:
cdef Py_ssize_t i
cdef floating v
for i in range(length):
v = a_ptr[i]
if isnan(v):
return FiniteStatus.has_nan
elif isinf(v):
return FiniteStatus.has_infinite
return FiniteStatus.all_finite
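A NumPy-level sketch of the same status check (hypothetical helper, not the Cython API; the vectorised checks give NaN precedence over inf, whereas the loop above reports whichever offending value it meets first):
import numpy as np

def finite_status(a, allow_nan=False):
    # 0: all_finite, 1: has_nan, 2: has_infinite (mirrors FiniteStatus)
    if not allow_nan and np.isnan(a).any():
        return 1
    if np.isinf(a).any():
        return 2
    return 0

print(finite_status(np.array([0.0, 1.0])))                      # 0
print(finite_status(np.array([0.0, np.nan])))                   # 1
print(finite_status(np.array([0.0, np.inf]), allow_nan=True))   # 2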
@@ -0,0 +1,40 @@
# TODO(1.7): remove this file
import warnings as _warnings
with _warnings.catch_warnings():
_warnings.simplefilter("ignore")
# joblib imports may raise DeprecationWarning on certain Python
# versions
import joblib
from joblib import (
Memory,
Parallel,
__version__,
cpu_count,
delayed,
dump,
effective_n_jobs,
hash,
load,
logger,
parallel_backend,
register_parallel_backend,
)
__all__ = [
"parallel_backend",
"register_parallel_backend",
"cpu_count",
"Parallel",
"Memory",
"delayed",
"effective_n_jobs",
"hash",
"logger",
"dump",
"load",
"joblib",
"__version__",
]
@@ -0,0 +1,178 @@
from contextlib import suppress
import numpy as np
from scipy import sparse as sp
from ._missing import is_scalar_nan
from ._param_validation import validate_params
from .fixes import _object_dtype_isnan
def _get_dense_mask(X, value_to_mask):
with suppress(ImportError, AttributeError):
# We also suppress `AttributeError` because older versions of pandas do
# not have `NA`.
import pandas
if value_to_mask is pandas.NA:
return pandas.isna(X)
if is_scalar_nan(value_to_mask):
if X.dtype.kind == "f":
Xt = np.isnan(X)
elif X.dtype.kind in ("i", "u"):
# can't have NaNs in integer array.
Xt = np.zeros(X.shape, dtype=bool)
else:
# np.isnan does not work on object dtypes.
Xt = _object_dtype_isnan(X)
else:
Xt = X == value_to_mask
return Xt
def _get_mask(X, value_to_mask):
"""Compute the boolean mask X == value_to_mask.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and
``n_features`` is the number of features.
value_to_mask : {int, float}
The value which is to be masked in X.
Returns
-------
X_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)
Missing mask.
"""
if not sp.issparse(X):
# For all cases apart of a sparse input where we need to reconstruct
# a sparse output
return _get_dense_mask(X, value_to_mask)
Xt = _get_dense_mask(X.data, value_to_mask)
sparse_constructor = sp.csr_matrix if X.format == "csr" else sp.csc_matrix
Xt_sparse = sparse_constructor(
(Xt, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool
)
return Xt_sparse
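A quick illustration of the dense path (assuming NumPy is available):
import numpy as np

X = np.array([[1.0, np.nan], [3.0, -1.0]])
print(_get_mask(X, np.nan))   # [[False  True] [False False]]
print(_get_mask(X, -1.0))     # [[False False] [False  True]]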
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"mask": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
"""Return a mask which is safe to use on X.
Parameters
----------
X : {array-like, sparse matrix}
Data on which to apply mask.
mask : array-like
Mask to be used on X.
Returns
-------
mask : ndarray
Array that is safe to use on X.
Examples
--------
>>> from sklearn.utils import safe_mask
>>> from scipy.sparse import csr_matrix
>>> data = csr_matrix([[1], [2], [3], [4], [5]])
>>> condition = [False, True, True, False, True]
>>> mask = safe_mask(data, condition)
>>> data[mask].toarray()
array([[2],
[3],
[5]])
"""
mask = np.asarray(mask)
if np.issubdtype(mask.dtype, np.signedinteger):
return mask
if hasattr(X, "toarray"):
ind = np.arange(mask.shape[0])
mask = ind[mask]
return mask
def axis0_safe_slice(X, mask, len_mask):
"""Return a mask which is safer to use on X than safe_mask.
This mask is safer than safe_mask since it returns an
empty array when a sparse matrix is sliced with an all-False
boolean mask, instead of raising an unhelpful error in older
versions of SciPy.
See: https://github.com/scipy/scipy/issues/5361
Also note that we can avoid doing the dot product by checking if
the len_mask is not zero in _huber_loss_and_gradient but this
is not going to be the bottleneck, since the number of outliers
and non_outliers are typically non-zero and it makes the code
tougher to follow.
Parameters
----------
X : {array-like, sparse matrix}
Data on which to apply mask.
mask : ndarray
Mask to be used on X.
len_mask : int
The length of the mask.
Returns
-------
mask : ndarray
Array that is safe to use on X.
"""
if len_mask != 0:
return X[safe_mask(X, mask), :]
return np.zeros(shape=(0, X.shape[1]))
def indices_to_mask(indices, mask_length):
"""Convert list of indices to boolean mask.
Parameters
----------
indices : list-like
List of integers treated as indices.
mask_length : int
Length of boolean mask to be generated.
This parameter must be greater than max(indices).
Returns
-------
mask : 1d boolean nd-array
Boolean array that is True where indices are present, else False.
Examples
--------
>>> from sklearn.utils._mask import indices_to_mask
>>> indices = [1, 2, 3, 4]
>>> indices_to_mask(indices, 5)
array([False, True, True, True, True])
"""
if mask_length <= np.max(indices):
raise ValueError("mask_length must be greater than max(indices)")
mask = np.zeros(mask_length, dtype=bool)
mask[indices] = True
return mask
File diff suppressed because it is too large
@@ -0,0 +1,65 @@
import math
import numbers
from contextlib import suppress
def is_scalar_nan(x):
"""Test if x is NaN.
This function is meant to overcome the issue that np.isnan does not allow
non-numerical types as input, and that np.nan is not float('nan').
Parameters
----------
x : any type
Any scalar value.
Returns
-------
bool
Returns true if x is NaN, and false otherwise.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils._missing import is_scalar_nan
>>> is_scalar_nan(np.nan)
True
>>> is_scalar_nan(float("nan"))
True
>>> is_scalar_nan(None)
False
>>> is_scalar_nan("")
False
>>> is_scalar_nan([np.nan])
False
"""
return (
not isinstance(x, numbers.Integral)
and isinstance(x, numbers.Real)
and math.isnan(x)
)
def is_pandas_na(x):
"""Test if x is pandas.NA.
We intentionally do not use this function to return `True` for `pd.NA` in
`is_scalar_nan`, because estimators that support `pd.NA` are the exception
rather than the rule at the moment. When `pd.NA` is more universally
supported, we may reconsider this decision.
Parameters
----------
x : any type
Returns
-------
boolean
"""
with suppress(ImportError):
from pandas import NA
return x is NA
return False
@@ -0,0 +1,410 @@
import numpy as np
from ..base import BaseEstimator, ClassifierMixin
from ..utils._metadata_requests import RequestMethod
from .metaestimators import available_if
from .validation import (
_check_sample_weight,
_num_samples,
check_array,
check_is_fitted,
check_random_state,
)
class ArraySlicingWrapper:
"""
Parameters
----------
array
"""
def __init__(self, array):
self.array = array
def __getitem__(self, aslice):
return MockDataFrame(self.array[aslice])
class MockDataFrame:
"""
Parameters
----------
array
"""
# have shape and length but don't support indexing.
def __init__(self, array):
self.array = array
self.values = array
self.shape = array.shape
self.ndim = array.ndim
# ugly hack to make iloc work.
self.iloc = ArraySlicingWrapper(array)
def __len__(self):
return len(self.array)
def __array__(self, dtype=None):
# Pandas data frames also are array-like: we want to make sure that
# input validation in cross-validation does not try to call that
# method.
return self.array
def __eq__(self, other):
return MockDataFrame(self.array == other.array)
def __ne__(self, other):
return not self == other
def take(self, indices, axis=0):
return MockDataFrame(self.array.take(indices, axis=axis))
class CheckingClassifier(ClassifierMixin, BaseEstimator):
"""Dummy classifier to test pipelining and meta-estimators.
Checks some property of `X` and `y` in fit / predict.
This allows testing whether pipelines / cross-validation or metaestimators
changed the input.
Can also be used to check if `fit_params` are passed correctly, and
to force a certain score to be returned.
Parameters
----------
check_y, check_X : callable, default=None
The callables used to validate `X` and `y`. These callables should return
a bool where `False` will trigger an `AssertionError`. If `None`, the
data is not validated. Default is `None`.
check_y_params, check_X_params : dict, default=None
The optional parameters to pass to `check_X` and `check_y`. If `None`,
then no parameters are passed in.
methods_to_check : "all" or list of str, default="all"
The methods in which the checks should be applied. By default,
all checks will be done on all methods (`fit`, `predict`,
`predict_proba`, `decision_function` and `score`).
foo_param : int, default=0
A `foo` param. When `foo > 1`, the output of :meth:`score` will be 1
otherwise it is 0.
expected_sample_weight : bool, default=False
Whether to check if a valid `sample_weight` was passed to `fit`.
expected_fit_params : list of str, default=None
A list of the expected parameters given when calling `fit`.
Attributes
----------
classes_ : int
The classes seen during `fit`.
n_features_in_ : int
The number of features seen during `fit`.
Examples
--------
>>> from sklearn.utils._mocking import CheckingClassifier
This helper allows asserting specific properties of `X` or `y`. In this
case we expect `check_X` or `check_y` to return a boolean.
>>> from sklearn.datasets import load_iris
>>> X, y = load_iris(return_X_y=True)
>>> clf = CheckingClassifier(check_X=lambda x: x.shape == (150, 4))
>>> clf.fit(X, y)
CheckingClassifier(...)
We can also provide a check which might raise an error. In this case, we
expect `check_X` to return `X` and `check_y` to return `y`.
>>> from sklearn.utils import check_array
>>> clf = CheckingClassifier(check_X=check_array)
>>> clf.fit(X, y)
CheckingClassifier(...)
"""
def __init__(
self,
*,
check_y=None,
check_y_params=None,
check_X=None,
check_X_params=None,
methods_to_check="all",
foo_param=0,
expected_sample_weight=None,
expected_fit_params=None,
random_state=None,
):
self.check_y = check_y
self.check_y_params = check_y_params
self.check_X = check_X
self.check_X_params = check_X_params
self.methods_to_check = methods_to_check
self.foo_param = foo_param
self.expected_sample_weight = expected_sample_weight
self.expected_fit_params = expected_fit_params
self.random_state = random_state
def _check_X_y(self, X, y=None, should_be_fitted=True):
"""Validate X and y and make extra check.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data set.
`X` is checked only if `check_X` is not `None` (default is None).
y : array-like of shape (n_samples), default=None
The corresponding target, by default `None`.
`y` is checked only if `check_y` is not `None` (default is None).
should_be_fitted : bool, default=True
Whether or not the classifier should be already fitted.
By default True.
Returns
-------
X, y
"""
if should_be_fitted:
check_is_fitted(self)
if self.check_X is not None:
params = {} if self.check_X_params is None else self.check_X_params
checked_X = self.check_X(X, **params)
if isinstance(checked_X, (bool, np.bool_)):
assert checked_X
else:
X = checked_X
if y is not None and self.check_y is not None:
params = {} if self.check_y_params is None else self.check_y_params
checked_y = self.check_y(y, **params)
if isinstance(checked_y, (bool, np.bool_)):
assert checked_y
else:
y = checked_y
return X, y
def fit(self, X, y, sample_weight=None, **fit_params):
"""Fit classifier.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where `n_samples` is the number of samples and
`n_features` is the number of features.
y : array-like of shape (n_samples, n_outputs) or (n_samples,), \
default=None
Target relative to X for classification or regression;
None for unsupervised learning.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
**fit_params : dict of string -> object
Parameters passed to the ``fit`` method of the estimator.
Returns
-------
self
"""
assert _num_samples(X) == _num_samples(y)
if self.methods_to_check == "all" or "fit" in self.methods_to_check:
X, y = self._check_X_y(X, y, should_be_fitted=False)
self.n_features_in_ = np.shape(X)[1]
self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True))
if self.expected_fit_params:
missing = set(self.expected_fit_params) - set(fit_params)
if missing:
raise AssertionError(
f"Expected fit parameter(s) {list(missing)} not seen."
)
for key, value in fit_params.items():
if _num_samples(value) != _num_samples(X):
raise AssertionError(
f"Fit parameter {key} has length {_num_samples(value)}"
f"; expected {_num_samples(X)}."
)
if self.expected_sample_weight:
if sample_weight is None:
raise AssertionError("Expected sample_weight to be passed")
_check_sample_weight(sample_weight, X)
return self
def predict(self, X):
"""Predict the first class seen in `classes_`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data.
Returns
-------
preds : ndarray of shape (n_samples,)
Predictions, each being one of the classes seen in `classes_`.
"""
if self.methods_to_check == "all" or "predict" in self.methods_to_check:
X, y = self._check_X_y(X)
rng = check_random_state(self.random_state)
return rng.choice(self.classes_, size=_num_samples(X))
def predict_proba(self, X):
"""Predict probabilities for each class.
Here, the dummy classifier provides random probabilities that are
normalized to sum to 1 for each sample.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data.
Returns
-------
proba : ndarray of shape (n_samples, n_classes)
The probabilities for each sample and class.
"""
if self.methods_to_check == "all" or "predict_proba" in self.methods_to_check:
X, y = self._check_X_y(X)
rng = check_random_state(self.random_state)
proba = rng.randn(_num_samples(X), len(self.classes_))
proba = np.abs(proba, out=proba)
proba /= np.sum(proba, axis=1)[:, np.newaxis]
return proba
def decision_function(self, X):
"""Confidence score.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input data.
Returns
-------
decision : ndarray of shape (n_samples,) if n_classes == 2\
else (n_samples, n_classes)
Confidence score.
"""
if (
self.methods_to_check == "all"
or "decision_function" in self.methods_to_check
):
X, y = self._check_X_y(X)
rng = check_random_state(self.random_state)
if len(self.classes_) == 2:
# for a binary classifier, the confidence score is relative to
# classes_[1], so a single score per sample is returned.
return rng.randn(_num_samples(X))
else:
return rng.randn(_num_samples(X), len(self.classes_))
def score(self, X=None, Y=None):
"""Fake score.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data, where `n_samples` is the number of samples and
`n_features` is the number of features.
Y : array-like of shape (n_samples, n_output) or (n_samples,)
Target relative to X for classification or regression;
None for unsupervised learning.
Returns
-------
score : float
Either 0 or 1 depending on `foo_param` (i.e. `score=1` if `foo_param > 1`,
otherwise `score=0`).
"""
if self.methods_to_check == "all" or "score" in self.methods_to_check:
self._check_X_y(X, Y)
if self.foo_param > 1:
score = 1.0
else:
score = 0.0
return score
def _more_tags(self):
return {"_skip_test": True, "X_types": ["1dlabel"]}
# Deactivate key validation for CheckingClassifier because we want to be able to
# call fit with arbitrary fit_params and record them. Without this change, we
# would get an error because those arbitrary params are not expected.
CheckingClassifier.set_fit_request = RequestMethod( # type: ignore
name="fit", keys=[], validate_keys=False
)
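# Illustrative sketch (not part of the module): checking that a fit parameter is
# forwarded to `fit`. The parameter name `sample_prop` is hypothetical.
#
#     import numpy as np
#     X, y = np.zeros((10, 2)), np.zeros(10)
#     clf = CheckingClassifier(expected_fit_params=["sample_prop"])
#     clf.fit(X, y, sample_prop=np.ones(10))   # passes
#     clf.fit(X, y)                            # AssertionError: parameter not seen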
class NoSampleWeightWrapper(BaseEstimator):
"""Wrap estimator which will not expose `sample_weight`.
Parameters
----------
est : estimator, default=None
The estimator to wrap.
"""
def __init__(self, est=None):
self.est = est
def fit(self, X, y):
return self.est.fit(X, y)
def predict(self, X):
return self.est.predict(X)
def predict_proba(self, X):
return self.est.predict_proba(X)
def _more_tags(self):
return {"_skip_test": True}
def _check_response(method):
def check(self):
return self.response_methods is not None and method in self.response_methods
return check
class _MockEstimatorOnOffPrediction(BaseEstimator):
"""Estimator for which we can turn on/off the prediction methods.
Parameters
----------
response_methods : list of \
{"predict", "predict_proba", "decision_function"}, default=None
List containing the response methods implemented by the estimator. When a
response method is in the list, calling it returns the name of that method.
Otherwise, an `AttributeError` is raised, which allows using `getattr` as
with any conventional estimator. By default, no response methods are
mocked.
"""
def __init__(self, response_methods=None):
self.response_methods = response_methods
def fit(self, X, y):
self.classes_ = np.unique(y)
return self
@available_if(_check_response("predict"))
def predict(self, X):
return "predict"
@available_if(_check_response("predict_proba"))
def predict_proba(self, X):
return "predict_proba"
@available_if(_check_response("decision_function"))
def decision_function(self, X):
return "decision_function"
@@ -0,0 +1,33 @@
# Helpers to safely access OpenMP routines
#
# no-op implementations are provided for the case where OpenMP is not available.
#
# All calls to OpenMP routines should be cimported from this module.
cdef extern from *:
"""
#ifdef _OPENMP
#include <omp.h>
#define SKLEARN_OPENMP_PARALLELISM_ENABLED 1
#else
#define SKLEARN_OPENMP_PARALLELISM_ENABLED 0
#define omp_lock_t int
#define omp_init_lock(l) (void)0
#define omp_destroy_lock(l) (void)0
#define omp_set_lock(l) (void)0
#define omp_unset_lock(l) (void)0
#define omp_get_thread_num() 0
#define omp_get_max_threads() 1
#endif
"""
bint SKLEARN_OPENMP_PARALLELISM_ENABLED
ctypedef struct omp_lock_t:
pass
void omp_init_lock(omp_lock_t*) noexcept nogil
void omp_destroy_lock(omp_lock_t*) noexcept nogil
void omp_set_lock(omp_lock_t*) noexcept nogil
void omp_unset_lock(omp_lock_t*) noexcept nogil
int omp_get_thread_num() noexcept nogil
int omp_get_max_threads() noexcept nogil
@@ -0,0 +1,77 @@
import os
from joblib import cpu_count
# Module level cache for cpu_count as we do not expect this to change during
# the lifecycle of a Python program. This dictionary is keyed by
# only_physical_cores.
_CPU_COUNTS = {}
def _openmp_parallelism_enabled():
"""Determines whether scikit-learn has been built with OpenMP
It allows to retrieve at runtime the information gathered at compile time.
"""
# SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time and defined
# in _openmp_helpers.pxd as a boolean. This function exposes it to Python.
return SKLEARN_OPENMP_PARALLELISM_ENABLED
cpdef _openmp_effective_n_threads(n_threads=None, only_physical_cores=True):
"""Determine the effective number of threads to be used for OpenMP calls
- For ``n_threads = None``,
- if the ``OMP_NUM_THREADS`` environment variable is set, return
``openmp.omp_get_max_threads()``
- otherwise, return the minimum between ``openmp.omp_get_max_threads()``
and the number of cpus, taking cgroups quotas into account. Cgroups
quotas can typically be set by tools such as Docker.
The result of ``omp_get_max_threads`` can be influenced by environment
variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``.
- For ``n_threads > 0``, return this as the maximal number of threads for
parallel OpenMP calls.
- For ``n_threads < 0``, return the maximal number of threads minus
``|n_threads + 1|``. In particular ``n_threads = -1`` will use as many
threads as there are available cores on the machine.
- Raise a ValueError for ``n_threads = 0``.
Passing the `only_physical_cores=False` flag makes it possible to use extra
threads for SMT/HyperThreading logical cores. It has been empirically
observed that using as many threads as available SMT cores can slightly
improve the performance in some cases, but can severely degrade
performance at other times. Therefore it is recommended to use
`only_physical_cores=True` unless an empirical study has been conducted to
assess the impact of SMT on a case-by-case basis (using various input data
shapes, in particular small data shapes).
If scikit-learn is built without OpenMP support, always return 1.
"""
if n_threads == 0:
raise ValueError("n_threads = 0 is invalid")
if not SKLEARN_OPENMP_PARALLELISM_ENABLED:
# OpenMP disabled at build-time => sequential mode
return 1
if os.getenv("OMP_NUM_THREADS"):
# Fall back to user provided number of threads making it possible
# to exceed the number of cpus.
max_n_threads = omp_get_max_threads()
else:
try:
n_cpus = _CPU_COUNTS[only_physical_cores]
except KeyError:
n_cpus = cpu_count(only_physical_cores=only_physical_cores)
_CPU_COUNTS[only_physical_cores] = n_cpus
max_n_threads = min(omp_get_max_threads(), n_cpus)
if n_threads is None:
return max_n_threads
elif n_threads < 0:
return max(1, max_n_threads + n_threads + 1)
return n_threads
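# Illustrative sketch (not part of the module) of the semantics described in the
# docstring, assuming an OpenMP build where the effective maximum is 4 threads:
#
#     _openmp_effective_n_threads()      # -> 4 (use all available threads)
#     _openmp_effective_n_threads(2)     # -> 2 (explicit positive request)
#     _openmp_effective_n_threads(-1)    # -> 4 (max_n_threads + (-1) + 1)
#     _openmp_effective_n_threads(-2)    # -> 3
#     _openmp_effective_n_threads(0)     # raises ValueError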
@@ -0,0 +1,42 @@
def check_matplotlib_support(caller_name):
"""Raise ImportError with detailed error message if mpl is not installed.
Plot utilities like any of the Display's plotting functions should lazily import
matplotlib and call this helper before any computation.
Parameters
----------
caller_name : str
The name of the caller that requires matplotlib.
"""
try:
import matplotlib # noqa
except ImportError as e:
raise ImportError(
"{} requires matplotlib. You can install matplotlib with "
"`pip install matplotlib`".format(caller_name)
) from e
def check_pandas_support(caller_name):
"""Raise ImportError with detailed error message if pandas is not installed.
Utilities like :func:`fetch_openml` should lazily import
pandas and call this helper before any computation.
Parameters
----------
caller_name : str
The name of the caller that requires pandas.
Returns
-------
pandas
The pandas package.
"""
try:
import pandas # noqa
return pandas
except ImportError as e:
raise ImportError("{} requires pandas.".format(caller_name)) from e
@@ -0,0 +1,905 @@
import functools
import math
import operator
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable
from inspect import signature
from numbers import Integral, Real
import numpy as np
from scipy.sparse import csr_matrix, issparse
from .._config import config_context, get_config
from .validation import _is_arraylike_not_scalar
class InvalidParameterError(ValueError, TypeError):
"""Custom exception to be raised when the parameter of a class/method/function
does not have a valid type or value.
"""
# Inherits from ValueError and TypeError to keep backward compatibility.
def validate_parameter_constraints(parameter_constraints, params, caller_name):
"""Validate types and values of given parameters.
Parameters
----------
parameter_constraints : dict or {"no_validation"}
If "no_validation", validation is skipped for this parameter.
If a dict, it must be a dictionary `param_name: list of constraints`.
A parameter is valid if it satisfies one of the constraints from the list.
Constraints can be:
- an Interval object, representing a continuous or discrete range of numbers
- the string "array-like"
- the string "sparse matrix"
- the string "random_state"
- callable
- None, meaning that None is a valid value for the parameter
- any type, meaning that any instance of this type is valid
- an Options object, representing a set of elements of a given type
- a StrOptions object, representing a set of strings
- the string "boolean"
- the string "verbose"
- the string "cv_object"
- the string "nan"
- a MissingValues object representing markers for missing values
- a HasMethods object, representing method(s) an object must have
- a Hidden object, representing a constraint not meant to be exposed to the user
params : dict
A dictionary `param_name: param_value`. The parameters to validate against the
constraints.
caller_name : str
The name of the estimator or function or method that called this function.
"""
for param_name, param_val in params.items():
# We allow parameters to not have a constraint so that third party estimators
# can inherit from sklearn estimators without having to necessarily use the
# validation tools.
if param_name not in parameter_constraints:
continue
constraints = parameter_constraints[param_name]
if constraints == "no_validation":
continue
constraints = [make_constraint(constraint) for constraint in constraints]
for constraint in constraints:
if constraint.is_satisfied_by(param_val):
# this constraint is satisfied, no need to check further.
break
else:
# No constraint is satisfied, raise with an informative message.
# Ignore constraints that we don't want to expose in the error message,
# i.e. options that are for internal purpose or not officially supported.
constraints = [
constraint for constraint in constraints if not constraint.hidden
]
if len(constraints) == 1:
constraints_str = f"{constraints[0]}"
else:
constraints_str = (
f"{', '.join([str(c) for c in constraints[:-1]])} or"
f" {constraints[-1]}"
)
raise InvalidParameterError(
f"The {param_name!r} parameter of {caller_name} must be"
f" {constraints_str}. Got {param_val!r} instead."
)
def make_constraint(constraint):
"""Convert the constraint into the appropriate Constraint object.
Parameters
----------
constraint : object
The constraint to convert.
Returns
-------
constraint : instance of _Constraint
The converted constraint.
"""
if isinstance(constraint, str) and constraint == "array-like":
return _ArrayLikes()
if isinstance(constraint, str) and constraint == "sparse matrix":
return _SparseMatrices()
if isinstance(constraint, str) and constraint == "random_state":
return _RandomStates()
if constraint is callable:
return _Callables()
if constraint is None:
return _NoneConstraint()
if isinstance(constraint, type):
return _InstancesOf(constraint)
if isinstance(
constraint, (Interval, StrOptions, Options, HasMethods, MissingValues)
):
return constraint
if isinstance(constraint, str) and constraint == "boolean":
return _Booleans()
if isinstance(constraint, str) and constraint == "verbose":
return _VerboseHelper()
if isinstance(constraint, str) and constraint == "cv_object":
return _CVObjects()
if isinstance(constraint, Hidden):
constraint = make_constraint(constraint.constraint)
constraint.hidden = True
return constraint
if isinstance(constraint, str) and constraint == "nan":
return _NanConstraint()
raise ValueError(f"Unknown constraint type: {constraint}")
def validate_params(parameter_constraints, *, prefer_skip_nested_validation):
"""Decorator to validate types and values of functions and methods.
Parameters
----------
parameter_constraints : dict
A dictionary `param_name: list of constraints`. See the docstring of
`validate_parameter_constraints` for a description of the accepted constraints.
Note that the *args and **kwargs parameters are not validated and must not be
present in the parameter_constraints dictionary.
prefer_skip_nested_validation : bool
If True, the validation of parameters of inner estimators or functions
called by the decorated function will be skipped.
This is useful to avoid repeatedly validating the parameters passed by the
user through the public facing API. It is also useful to avoid validating
parameters that we pass internally to inner functions and that are
guaranteed to be valid by the test suite.
It should be set to True for most functions, except for those that receive
non-validated objects as parameters or that are just wrappers around classes
because they only perform a partial validation.
Returns
-------
decorated_function : function or method
The decorated function.
"""
def decorator(func):
# The dict of parameter constraints is set as an attribute of the function
# to make it possible to dynamically introspect the constraints for
# automatic testing.
setattr(func, "_skl_parameter_constraints", parameter_constraints)
@functools.wraps(func)
def wrapper(*args, **kwargs):
global_skip_validation = get_config()["skip_parameter_validation"]
if global_skip_validation:
return func(*args, **kwargs)
func_sig = signature(func)
# Map *args/**kwargs to the function signature
params = func_sig.bind(*args, **kwargs)
params.apply_defaults()
# ignore self/cls and positional/keyword markers
to_ignore = [
p.name
for p in func_sig.parameters.values()
if p.kind in (p.VAR_POSITIONAL, p.VAR_KEYWORD)
]
to_ignore += ["self", "cls"]
params = {k: v for k, v in params.arguments.items() if k not in to_ignore}
validate_parameter_constraints(
parameter_constraints, params, caller_name=func.__qualname__
)
try:
with config_context(
skip_parameter_validation=(
prefer_skip_nested_validation or global_skip_validation
)
):
return func(*args, **kwargs)
except InvalidParameterError as e:
# When the function is just a wrapper around an estimator, we allow
# the function to delegate validation to the estimator, but we replace
# the name of the estimator by the name of the function in the error
# message to avoid confusion.
msg = re.sub(
r"parameter of \w+ must be",
f"parameter of {func.__qualname__} must be",
str(e),
)
raise InvalidParameterError(msg) from e
return wrapper
return decorator
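# Illustrative sketch (not part of the module): a hypothetical function decorated
# with `validate_params`. The constraints and the function name are made up.
#
#     @validate_params(
#         {
#             "n_components": [Interval(Integral, 1, None, closed="left")],
#             "metric": [StrOptions({"euclidean", "cosine"}), callable],
#         },
#         prefer_skip_nested_validation=True,
#     )
#     def embed(X, n_components=2, metric="euclidean"):
#         ...
#
#     embed([[0, 1]], n_components=0)
#     # InvalidParameterError: The 'n_components' parameter of embed must be
#     # an int in the range [1, inf). Got 0 instead.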
class RealNotInt(Real):
"""A type that represents reals that are not instances of int.
Behaves like float, but also works with values extracted from numpy arrays.
isinstance(1, RealNotInt) -> False
isinstance(1.0, RealNotInt) -> True
"""
RealNotInt.register(float)
def _type_name(t):
"""Convert type into human readable string."""
module = t.__module__
qualname = t.__qualname__
if module == "builtins":
return qualname
elif t == Real:
return "float"
elif t == Integral:
return "int"
return f"{module}.{qualname}"
class _Constraint(ABC):
"""Base class for the constraint objects."""
def __init__(self):
self.hidden = False
@abstractmethod
def is_satisfied_by(self, val):
"""Whether or not a value satisfies the constraint.
Parameters
----------
val : object
The value to check.
Returns
-------
is_satisfied : bool
Whether or not the constraint is satisfied by this value.
"""
@abstractmethod
def __str__(self):
"""A human readable representational string of the constraint."""
class _InstancesOf(_Constraint):
"""Constraint representing instances of a given type.
Parameters
----------
type : type
The valid type.
"""
def __init__(self, type):
super().__init__()
self.type = type
def is_satisfied_by(self, val):
return isinstance(val, self.type)
def __str__(self):
return f"an instance of {_type_name(self.type)!r}"
class _NoneConstraint(_Constraint):
"""Constraint representing the None singleton."""
def is_satisfied_by(self, val):
return val is None
def __str__(self):
return "None"
class _NanConstraint(_Constraint):
"""Constraint representing the indicator `np.nan`."""
def is_satisfied_by(self, val):
return (
not isinstance(val, Integral) and isinstance(val, Real) and math.isnan(val)
)
def __str__(self):
return "numpy.nan"
class _PandasNAConstraint(_Constraint):
"""Constraint representing the indicator `pd.NA`."""
def is_satisfied_by(self, val):
try:
import pandas as pd
return isinstance(val, type(pd.NA)) and pd.isna(val)
except ImportError:
return False
def __str__(self):
return "pandas.NA"
class Options(_Constraint):
"""Constraint representing a finite set of instances of a given type.
Parameters
----------
type : type
options : set
The set of valid scalars.
deprecated : set or None, default=None
A subset of the `options` to mark as deprecated in the string
representation of the constraint.
"""
def __init__(self, type, options, *, deprecated=None):
super().__init__()
self.type = type
self.options = options
self.deprecated = deprecated or set()
if self.deprecated - self.options:
raise ValueError("The deprecated options must be a subset of the options.")
def is_satisfied_by(self, val):
return isinstance(val, self.type) and val in self.options
def _mark_if_deprecated(self, option):
"""Add a deprecated mark to an option if needed."""
option_str = f"{option!r}"
if option in self.deprecated:
option_str = f"{option_str} (deprecated)"
return option_str
def __str__(self):
options_str = (
f"{', '.join([self._mark_if_deprecated(o) for o in self.options])}"
)
return f"a {_type_name(self.type)} among {{{options_str}}}"
class StrOptions(Options):
"""Constraint representing a finite set of strings.
Parameters
----------
options : set of str
The set of valid strings.
deprecated : set of str or None, default=None
A subset of the `options` to mark as deprecated in the string
representation of the constraint.
"""
def __init__(self, options, *, deprecated=None):
super().__init__(type=str, options=options, deprecated=deprecated)
class Interval(_Constraint):
"""Constraint representing a typed interval.
Parameters
----------
type : {numbers.Integral, numbers.Real, RealNotInt}
The set of numbers in which to set the interval.
If RealNotInt, only reals that don't have the integer type
are allowed. For example 1.0 is allowed but 1 is not.
left : float or int or None
The left bound of the interval. None means left bound is -∞.
right : float, int or None
The right bound of the interval. None means right bound is +∞.
closed : {"left", "right", "both", "neither"}
Whether the interval is open or closed. Possible choices are:
- `"left"`: the interval is closed on the left and open on the right.
It is equivalent to the interval `[ left, right )`.
- `"right"`: the interval is closed on the right and open on the left.
It is equivalent to the interval `( left, right ]`.
- `"both"`: the interval is closed.
It is equivalent to the interval `[ left, right ]`.
- `"neither"`: the interval is open.
It is equivalent to the interval `( left, right )`.
Notes
-----
Setting a bound to `None` and setting the interval closed is valid. For instance,
strictly speaking, `Interval(Real, 0, None, closed="both")` corresponds to
`[0, +∞) U {+∞}`.
"""
def __init__(self, type, left, right, *, closed):
super().__init__()
self.type = type
self.left = left
self.right = right
self.closed = closed
self._check_params()
def _check_params(self):
if self.type not in (Integral, Real, RealNotInt):
raise ValueError(
"type must be either numbers.Integral, numbers.Real or RealNotInt."
f" Got {self.type} instead."
)
if self.closed not in ("left", "right", "both", "neither"):
raise ValueError(
"closed must be either 'left', 'right', 'both' or 'neither'. "
f"Got {self.closed} instead."
)
if self.type is Integral:
suffix = "for an interval over the integers."
if self.left is not None and not isinstance(self.left, Integral):
raise TypeError(f"Expecting left to be an int {suffix}")
if self.right is not None and not isinstance(self.right, Integral):
raise TypeError(f"Expecting right to be an int {suffix}")
if self.left is None and self.closed in ("left", "both"):
raise ValueError(
f"left can't be None when closed == {self.closed} {suffix}"
)
if self.right is None and self.closed in ("right", "both"):
raise ValueError(
f"right can't be None when closed == {self.closed} {suffix}"
)
else:
if self.left is not None and not isinstance(self.left, Real):
raise TypeError("Expecting left to be a real number.")
if self.right is not None and not isinstance(self.right, Real):
raise TypeError("Expecting right to be a real number.")
if self.right is not None and self.left is not None and self.right <= self.left:
raise ValueError(
f"right can't be less than left. Got left={self.left} and "
f"right={self.right}"
)
def __contains__(self, val):
if not isinstance(val, Integral) and np.isnan(val):
return False
left_cmp = operator.lt if self.closed in ("left", "both") else operator.le
right_cmp = operator.gt if self.closed in ("right", "both") else operator.ge
left = -np.inf if self.left is None else self.left
right = np.inf if self.right is None else self.right
if left_cmp(val, left):
return False
if right_cmp(val, right):
return False
return True
def is_satisfied_by(self, val):
if not isinstance(val, self.type):
return False
return val in self
def __str__(self):
type_str = "an int" if self.type is Integral else "a float"
left_bracket = "[" if self.closed in ("left", "both") else "("
left_bound = "-inf" if self.left is None else self.left
right_bound = "inf" if self.right is None else self.right
right_bracket = "]" if self.closed in ("right", "both") else ")"
# better repr if the bounds were given as integers
if not self.type == Integral and isinstance(self.left, Real):
left_bound = float(left_bound)
if not self.type == Integral and isinstance(self.right, Real):
right_bound = float(right_bound)
return (
f"{type_str} in the range "
f"{left_bracket}{left_bound}, {right_bound}{right_bracket}"
)
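# Illustrative sketch (not part of the module) of the Interval semantics:
#
#     Interval(Integral, 1, 10, closed="both").is_satisfied_by(5)        # True
#     Interval(Integral, 1, 10, closed="both").is_satisfied_by(5.0)      # False (not an int)
#     Interval(RealNotInt, 0, 1, closed="neither").is_satisfied_by(0.5)  # True
#     Interval(RealNotInt, 0, 1, closed="neither").is_satisfied_by(1)    # False (ints excluded)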
class _ArrayLikes(_Constraint):
"""Constraint representing array-likes"""
def is_satisfied_by(self, val):
return _is_arraylike_not_scalar(val)
def __str__(self):
return "an array-like"
class _SparseMatrices(_Constraint):
"""Constraint representing sparse matrices."""
def is_satisfied_by(self, val):
return issparse(val)
def __str__(self):
return "a sparse matrix"
class _Callables(_Constraint):
"""Constraint representing callables."""
def is_satisfied_by(self, val):
return callable(val)
def __str__(self):
return "a callable"
class _RandomStates(_Constraint):
"""Constraint representing random states.
Convenience class for
[Interval(Integral, 0, 2**32 - 1, closed="both"), np.random.RandomState, None]
"""
def __init__(self):
super().__init__()
self._constraints = [
Interval(Integral, 0, 2**32 - 1, closed="both"),
_InstancesOf(np.random.RandomState),
_NoneConstraint(),
]
def is_satisfied_by(self, val):
return any(c.is_satisfied_by(val) for c in self._constraints)
def __str__(self):
return (
f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
f" {self._constraints[-1]}"
)
class _Booleans(_Constraint):
"""Constraint representing boolean likes.
Convenience class for
[bool, np.bool_]
"""
def __init__(self):
super().__init__()
self._constraints = [
_InstancesOf(bool),
_InstancesOf(np.bool_),
]
def is_satisfied_by(self, val):
return any(c.is_satisfied_by(val) for c in self._constraints)
def __str__(self):
return (
f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
f" {self._constraints[-1]}"
)
class _VerboseHelper(_Constraint):
"""Helper constraint for the verbose parameter.
Convenience class for
[Interval(Integral, 0, None, closed="left"), bool, numpy.bool_]
"""
def __init__(self):
super().__init__()
self._constraints = [
Interval(Integral, 0, None, closed="left"),
_InstancesOf(bool),
_InstancesOf(np.bool_),
]
def is_satisfied_by(self, val):
return any(c.is_satisfied_by(val) for c in self._constraints)
def __str__(self):
return (
f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
f" {self._constraints[-1]}"
)
class MissingValues(_Constraint):
"""Helper constraint for the `missing_values` parameters.
Convenience for
[
Integral,
Interval(Real, None, None, closed="both"),
str, # when numeric_only is False
None, # when numeric_only is False
_NanConstraint(),
_PandasNAConstraint(),
]
Parameters
----------
numeric_only : bool, default=False
Whether to consider only numeric missing value markers.
"""
def __init__(self, numeric_only=False):
super().__init__()
self.numeric_only = numeric_only
self._constraints = [
_InstancesOf(Integral),
# we use an interval of Real to ignore np.nan that has its own constraint
Interval(Real, None, None, closed="both"),
_NanConstraint(),
_PandasNAConstraint(),
]
if not self.numeric_only:
self._constraints.extend([_InstancesOf(str), _NoneConstraint()])
def is_satisfied_by(self, val):
return any(c.is_satisfied_by(val) for c in self._constraints)
def __str__(self):
return (
f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
f" {self._constraints[-1]}"
)
class HasMethods(_Constraint):
"""Constraint representing objects that expose specific methods.
It is useful for parameters following a protocol and where we don't want to impose
an affiliation to a specific module or class.
Parameters
----------
methods : str or list of str
The method(s) that the object is expected to expose.
"""
@validate_params(
{"methods": [str, list]},
prefer_skip_nested_validation=True,
)
def __init__(self, methods):
super().__init__()
if isinstance(methods, str):
methods = [methods]
self.methods = methods
def is_satisfied_by(self, val):
return all(callable(getattr(val, method, None)) for method in self.methods)
def __str__(self):
if len(self.methods) == 1:
methods = f"{self.methods[0]!r}"
else:
methods = (
f"{', '.join([repr(m) for m in self.methods[:-1]])} and"
f" {self.methods[-1]!r}"
)
return f"an object implementing {methods}"
class _IterablesNotString(_Constraint):
"""Constraint representing iterables that are not strings."""
def is_satisfied_by(self, val):
return isinstance(val, Iterable) and not isinstance(val, str)
def __str__(self):
return "an iterable"
class _CVObjects(_Constraint):
"""Constraint representing cv objects.
Convenience class for
[
Interval(Integral, 2, None, closed="left"),
HasMethods(["split", "get_n_splits"]),
_IterablesNotString(),
None,
]
"""
def __init__(self):
super().__init__()
self._constraints = [
Interval(Integral, 2, None, closed="left"),
HasMethods(["split", "get_n_splits"]),
_IterablesNotString(),
_NoneConstraint(),
]
def is_satisfied_by(self, val):
return any(c.is_satisfied_by(val) for c in self._constraints)
def __str__(self):
return (
f"{', '.join([str(c) for c in self._constraints[:-1]])} or"
f" {self._constraints[-1]}"
)
class Hidden:
"""Class encapsulating a constraint not meant to be exposed to the user.
Parameters
----------
constraint : str or _Constraint instance
The constraint to be used internally.
"""
def __init__(self, constraint):
self.constraint = constraint
def generate_invalid_param_val(constraint):
"""Return a value that does not satisfy the constraint.
Raises a NotImplementedError if there exists no invalid value for this constraint.
This is only useful for testing purposes.
Parameters
----------
constraint : _Constraint instance
The constraint to generate a value for.
Returns
-------
val : object
A value that does not satisfy the constraint.
"""
if isinstance(constraint, StrOptions):
return f"not {' or '.join(constraint.options)}"
if isinstance(constraint, MissingValues):
return np.array([1, 2, 3])
if isinstance(constraint, _VerboseHelper):
return -1
if isinstance(constraint, HasMethods):
return type("HasNotMethods", (), {})()
if isinstance(constraint, _IterablesNotString):
return "a string"
if isinstance(constraint, _CVObjects):
return "not a cv object"
if isinstance(constraint, Interval) and constraint.type is Integral:
if constraint.left is not None:
return constraint.left - 1
if constraint.right is not None:
return constraint.right + 1
# There's no integer outside (-inf, +inf)
raise NotImplementedError
if isinstance(constraint, Interval) and constraint.type in (Real, RealNotInt):
if constraint.left is not None:
return constraint.left - 1e-6
if constraint.right is not None:
return constraint.right + 1e-6
# bounds are -inf, +inf
if constraint.closed in ("right", "neither"):
return -np.inf
if constraint.closed in ("left", "neither"):
return np.inf
# interval is [-inf, +inf]
return np.nan
raise NotImplementedError
def generate_valid_param(constraint):
"""Return a value that does satisfy a constraint.
This is only useful for testing purposes.
Parameters
----------
constraint : Constraint instance
The constraint to generate a value for.
Returns
-------
val : object
A value that does satisfy the constraint.
"""
if isinstance(constraint, _ArrayLikes):
return np.array([1, 2, 3])
if isinstance(constraint, _SparseMatrices):
return csr_matrix([[0, 1], [1, 0]])
if isinstance(constraint, _RandomStates):
return np.random.RandomState(42)
if isinstance(constraint, _Callables):
return lambda x: x
if isinstance(constraint, _NoneConstraint):
return None
if isinstance(constraint, _InstancesOf):
if constraint.type is np.ndarray:
# special case for ndarray since it can't be instantiated without arguments
return np.array([1, 2, 3])
if constraint.type in (Integral, Real):
# special case for Integral and Real since they are abstract classes
return 1
return constraint.type()
if isinstance(constraint, _Booleans):
return True
if isinstance(constraint, _VerboseHelper):
return 1
if isinstance(constraint, MissingValues) and constraint.numeric_only:
return np.nan
if isinstance(constraint, MissingValues) and not constraint.numeric_only:
return "missing"
if isinstance(constraint, HasMethods):
return type(
"ValidHasMethods", (), {m: lambda self: None for m in constraint.methods}
)()
if isinstance(constraint, _IterablesNotString):
return [1, 2, 3]
if isinstance(constraint, _CVObjects):
return 5
if isinstance(constraint, Options): # includes StrOptions
for option in constraint.options:
return option
if isinstance(constraint, Interval):
interval = constraint
if interval.left is None and interval.right is None:
return 0
elif interval.left is None:
return interval.right - 1
elif interval.right is None:
return interval.left + 1
else:
if interval.type is Real:
return (interval.left + interval.right) / 2
else:
return interval.left + 1
raise ValueError(f"Unknown constraint type: {constraint}")
@@ -0,0 +1,99 @@
import numpy as np
from . import check_consistent_length
from ._optional_dependencies import check_matplotlib_support
from ._response import _get_response_values_binary
from .multiclass import type_of_target
from .validation import _check_pos_label_consistency
class _BinaryClassifierCurveDisplayMixin:
"""Mixin class to be used in Displays requiring a binary classifier.
The aim of this class is to centralize some validations regarding the estimator and
the target and gather the response of the estimator.
"""
def _validate_plot_params(self, *, ax=None, name=None):
check_matplotlib_support(f"{self.__class__.__name__}.plot")
import matplotlib.pyplot as plt
if ax is None:
_, ax = plt.subplots()
name = self.estimator_name if name is None else name
return ax, ax.figure, name
@classmethod
def _validate_and_get_response_values(
cls, estimator, X, y, *, response_method="auto", pos_label=None, name=None
):
check_matplotlib_support(f"{cls.__name__}.from_estimator")
name = estimator.__class__.__name__ if name is None else name
y_pred, pos_label = _get_response_values_binary(
estimator,
X,
response_method=response_method,
pos_label=pos_label,
)
return y_pred, pos_label, name
@classmethod
def _validate_from_predictions_params(
cls, y_true, y_pred, *, sample_weight=None, pos_label=None, name=None
):
check_matplotlib_support(f"{cls.__name__}.from_predictions")
if type_of_target(y_true) != "binary":
raise ValueError(
f"The target y is not binary. Got {type_of_target(y_true)} type of"
" target."
)
check_consistent_length(y_true, y_pred, sample_weight)
pos_label = _check_pos_label_consistency(pos_label, y_true)
name = name if name is not None else "Classifier"
return pos_label, name
def _validate_score_name(score_name, scoring, negate_score):
"""Validate the `score_name` parameter.
If `score_name` is provided, we just return it as-is.
If `score_name` is `None` and `scoring` is `None`, we use `Score` if
`negate_score` is `False` and `Negative score` otherwise.
If `scoring` is a string or a callable, we infer the name. We replace `_` by
spaces and capitalize the first letter. We remove `neg_` and replace it by
`"Negative"` if `negate_score` is `False` or just remove it otherwise.
"""
if score_name is not None:
return score_name
elif scoring is None:
return "Negative score" if negate_score else "Score"
else:
score_name = scoring.__name__ if callable(scoring) else scoring
if negate_score:
if score_name.startswith("neg_"):
score_name = score_name[4:]
else:
score_name = f"Negative {score_name}"
elif score_name.startswith("neg_"):
score_name = f"Negative {score_name[4:]}"
score_name = score_name.replace("_", " ")
return score_name.capitalize()
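# Illustrative sketch (not part of the module) of the naming rules above:
#
#     _validate_score_name(None, None, negate_score=False)                        # "Score"
#     _validate_score_name(None, "neg_mean_absolute_error", negate_score=True)    # "Mean absolute error"
#     _validate_score_name(None, "accuracy", negate_score=True)                   # "Negative accuracy"
#     _validate_score_name("MAE", "neg_mean_absolute_error", negate_score=False)  # "MAE"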
def _interval_max_min_ratio(data):
"""Compute the ratio between the largest and smallest inter-point distances.
A value larger than 5 typically indicates that the parameter range would
better be displayed with a log scale while a linear scale would be more
suitable otherwise.
"""
diff = np.diff(np.sort(data))
return diff.max() / diff.min()
@@ -0,0 +1,463 @@
"""This module contains the _EstimatorPrettyPrinter class used in
BaseEstimator.__repr__ for pretty-printing estimators"""
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 Python Software Foundation;
# All Rights Reserved
# Authors: Fred L. Drake, Jr. <fdrake@acm.org> (built-in CPython pprint module)
# Nicolas Hug (scikit-learn specific changes)
# License: PSF License version 2 (see below)
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
# --------------------------------------------
# 1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"),
# and the Individual or Organization ("Licensee") accessing and otherwise
# using this software ("Python") in source or binary form and its associated
# documentation.
# 2. Subject to the terms and conditions of this License Agreement, PSF hereby
# grants Licensee a nonexclusive, royalty-free, world-wide license to
# reproduce, analyze, test, perform and/or display publicly, prepare
# derivative works, distribute, and otherwise use Python alone or in any
# derivative version, provided, however, that PSF's License Agreement and
# PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004,
# 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
# 2017, 2018 Python Software Foundation; All Rights Reserved" are retained in
# Python alone or in any derivative version prepared by Licensee.
# 3. In the event Licensee prepares a derivative work that is based on or
# incorporates Python or any part thereof, and wants to make the derivative
# work available to others as provided herein, then Licensee hereby agrees to
# include in any such work a brief summary of the changes made to Python.
# 4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES
# NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT
# NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF
# MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
# PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY
# INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF
# MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE
# THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
# 6. This License Agreement will automatically terminate upon a material
# breach of its terms and conditions.
# 7. Nothing in this License Agreement shall be deemed to create any
# relationship of agency, partnership, or joint venture between PSF and
# Licensee. This License Agreement does not grant permission to use PSF
# trademarks or trade name in a trademark sense to endorse or promote products
# or services of Licensee, or any third party.
# 8. By copying, installing or otherwise using Python, Licensee agrees to be
# bound by the terms and conditions of this License Agreement.
# Brief summary of changes to original code:
# - "compact" parameter is supported for dicts, not just lists or tuples
# - estimators have a custom handler, they're not just treated as objects
# - long sequences (lists, tuples, dict items) with more than N elements are
# shortened using ellipsis (', ...') at the end.
import inspect
import pprint
from collections import OrderedDict
from .._config import get_config
from ..base import BaseEstimator
from ._missing import is_scalar_nan
class KeyValTuple(tuple):
"""Dummy class for correctly rendering key-value tuples from dicts."""
def __repr__(self):
# needed for _dispatch[tuple.__repr__] not to be overridden
return super().__repr__()
class KeyValTupleParam(KeyValTuple):
"""Dummy class for correctly rendering key-value tuples from parameters."""
pass
def _changed_params(estimator):
"""Return dict (param_name: value) of parameters that were given to
estimator with non-default values."""
params = estimator.get_params(deep=False)
init_func = getattr(estimator.__init__, "deprecated_original", estimator.__init__)
init_params = inspect.signature(init_func).parameters
init_params = {name: param.default for name, param in init_params.items()}
def has_changed(k, v):
if k not in init_params: # happens if k is part of a **kwargs
return True
if init_params[k] == inspect._empty: # k has no default value
return True
# try to avoid calling repr on nested estimators
if isinstance(v, BaseEstimator) and v.__class__ != init_params[k].__class__:
return True
# Use repr as a last resort. It may be expensive.
if repr(v) != repr(init_params[k]) and not (
is_scalar_nan(init_params[k]) and is_scalar_nan(v)
):
return True
return False
return {k: v for k, v in params.items() if has_changed(k, v)}
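# Illustrative sketch (not part of the module): only parameters that differ from
# their defaults are kept (LogisticRegression's default C is 1.0).
#
#     from sklearn.linear_model import LogisticRegression
#     _changed_params(LogisticRegression(C=10.0))   # -> {'C': 10.0}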
class _EstimatorPrettyPrinter(pprint.PrettyPrinter):
"""Pretty Printer class for estimator objects.
This extends the pprint.PrettyPrinter class, because:
- we need estimators to be printed with their parameters, e.g.
Estimator(param1=value1, ...) which is not supported by default.
- the 'compact' parameter of PrettyPrinter is ignored for dicts, which
may lead to very long representations that we want to avoid.
Quick overview of pprint.PrettyPrinter (see also
https://stackoverflow.com/questions/49565047/pprint-with-hex-numbers):
- the entry point is the _format() method which calls format() (overridden
here)
- format() directly calls _safe_repr() for a first try at rendering the
object
- _safe_repr formats the whole object recursively, only calling itself,
not caring about line length or anything
- back to _format(), if the output string is too long, _format() then calls
the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on
the type of the object. This is where the line length and the compact
parameters are taken into account.
- those _pprint_TYPE() methods will internally use the format() method for
rendering the nested objects of an object (e.g. the elements of a list)
In the end, everything has to be implemented twice: in _safe_repr and in
the custom _pprint_TYPE methods. Unfortunately PrettyPrinter is really not
straightforward to extend (especially when we want a compact output), so
the code is a bit convoluted.
This class overrides:
- format() to support the changed_only parameter
- _safe_repr to support printing of estimators (for when they fit on a
single line)
- _format_dict_items so that dict are correctly 'compacted'
- _format_items so that ellipsis is used on long lists and tuples
When estimators cannot be printed on a single line, the builtin _format()
will call _pprint_estimator() because it was registered to do so (see
_dispatch[BaseEstimator.__repr__] = _pprint_estimator).
Both _format_dict_items() and _pprint_estimator() use the
_format_params_or_dict_items() method that will format parameters and
key-value pairs respecting the compact parameter. This method needs another
subroutine _pprint_key_val_tuple() used when a parameter or a key-value
pair is too long to fit on a single line. This subroutine is called in
_format() and is registered as well in the _dispatch dict (just like
_pprint_estimator). We had to create the two classes KeyValTuple and
KeyValTupleParam for this.
"""
def __init__(
self,
indent=1,
width=80,
depth=None,
stream=None,
*,
compact=False,
indent_at_name=True,
n_max_elements_to_show=None,
):
super().__init__(indent, width, depth, stream, compact=compact)
self._indent_at_name = indent_at_name
if self._indent_at_name:
self._indent_per_level = 1 # ignore indent param
self._changed_only = get_config()["print_changed_only"]
# Max number of elements in a list, dict, tuple until we start using
# ellipsis. This also affects the number of arguments of an estimator
# (they are treated as dicts)
self.n_max_elements_to_show = n_max_elements_to_show
def format(self, object, context, maxlevels, level):
return _safe_repr(
object, context, maxlevels, level, changed_only=self._changed_only
)
def _pprint_estimator(self, object, stream, indent, allowance, context, level):
stream.write(object.__class__.__name__ + "(")
if self._indent_at_name:
indent += len(object.__class__.__name__)
if self._changed_only:
params = _changed_params(object)
else:
params = object.get_params(deep=False)
params = OrderedDict((name, val) for (name, val) in sorted(params.items()))
self._format_params(
params.items(), stream, indent, allowance + 1, context, level
)
stream.write(")")
def _format_dict_items(self, items, stream, indent, allowance, context, level):
return self._format_params_or_dict_items(
items, stream, indent, allowance, context, level, is_dict=True
)
def _format_params(self, items, stream, indent, allowance, context, level):
return self._format_params_or_dict_items(
items, stream, indent, allowance, context, level, is_dict=False
)
def _format_params_or_dict_items(
self, object, stream, indent, allowance, context, level, is_dict
):
"""Format dict items or parameters respecting the compact=True
parameter. For some reason, the builtin rendering of dict items doesn't
respect compact=True and will use one line per key-value if all cannot
fit in a single line.
Dict items will be rendered as <'key': value> while params will be
rendered as <key=value>. The implementation is mostly copy/pasting from
the builtin _format_items().
This also adds ellipsis if the number of items is greater than
self.n_max_elements_to_show.
"""
write = stream.write
indent += self._indent_per_level
delimnl = ",\n" + " " * indent
delim = ""
width = max_width = self._width - indent + 1
it = iter(object)
try:
next_ent = next(it)
except StopIteration:
return
last = False
n_items = 0
while not last:
if n_items == self.n_max_elements_to_show:
write(", ...")
break
n_items += 1
ent = next_ent
try:
next_ent = next(it)
except StopIteration:
last = True
max_width -= allowance
width -= allowance
if self._compact:
k, v = ent
krepr = self._repr(k, context, level)
vrepr = self._repr(v, context, level)
if not is_dict:
krepr = krepr.strip("'")
middle = ": " if is_dict else "="
rep = krepr + middle + vrepr
w = len(rep) + 2
if width < w:
width = max_width
if delim:
delim = delimnl
if width >= w:
width -= w
write(delim)
delim = ", "
write(rep)
continue
write(delim)
delim = delimnl
class_ = KeyValTuple if is_dict else KeyValTupleParam
self._format(
class_(ent), stream, indent, allowance if last else 1, context, level
)
def _format_items(self, items, stream, indent, allowance, context, level):
"""Format the items of an iterable (list, tuple...). Same as the
built-in _format_items, with support for ellipsis if the number of
elements is greater than self.n_max_elements_to_show.
"""
write = stream.write
indent += self._indent_per_level
if self._indent_per_level > 1:
write((self._indent_per_level - 1) * " ")
delimnl = ",\n" + " " * indent
delim = ""
width = max_width = self._width - indent + 1
it = iter(items)
try:
next_ent = next(it)
except StopIteration:
return
last = False
n_items = 0
while not last:
if n_items == self.n_max_elements_to_show:
write(", ...")
break
n_items += 1
ent = next_ent
try:
next_ent = next(it)
except StopIteration:
last = True
max_width -= allowance
width -= allowance
if self._compact:
rep = self._repr(ent, context, level)
w = len(rep) + 2
if width < w:
width = max_width
if delim:
delim = delimnl
if width >= w:
width -= w
write(delim)
delim = ", "
write(rep)
continue
write(delim)
delim = delimnl
self._format(ent, stream, indent, allowance if last else 1, context, level)
def _pprint_key_val_tuple(self, object, stream, indent, allowance, context, level):
"""Pretty printing for key-value tuples from dict or parameters."""
k, v = object
rep = self._repr(k, context, level)
if isinstance(object, KeyValTupleParam):
rep = rep.strip("'")
middle = "="
else:
middle = ": "
stream.write(rep)
stream.write(middle)
self._format(
v, stream, indent + len(rep) + len(middle), allowance, context, level
)
# Note: need to copy _dispatch to prevent instances of the builtin
# PrettyPrinter class to call methods of _EstimatorPrettyPrinter (see issue
# 12906)
# mypy error: "Type[PrettyPrinter]" has no attribute "_dispatch"
_dispatch = pprint.PrettyPrinter._dispatch.copy() # type: ignore
_dispatch[BaseEstimator.__repr__] = _pprint_estimator
_dispatch[KeyValTuple.__repr__] = _pprint_key_val_tuple
def _safe_repr(object, context, maxlevels, level, changed_only=False):
"""Same as the builtin _safe_repr, with added support for Estimator
objects."""
typ = type(object)
if typ in pprint._builtin_scalars:
return repr(object), True, False
r = getattr(typ, "__repr__", None)
if issubclass(typ, dict) and r is dict.__repr__:
if not object:
return "{}", True, False
objid = id(object)
if maxlevels and level >= maxlevels:
return "{...}", False, objid in context
if objid in context:
return pprint._recursion(object), False, True
context[objid] = 1
readable = True
recursive = False
components = []
append = components.append
level += 1
saferepr = _safe_repr
items = sorted(object.items(), key=pprint._safe_tuple)
for k, v in items:
krepr, kreadable, krecur = saferepr(
k, context, maxlevels, level, changed_only=changed_only
)
vrepr, vreadable, vrecur = saferepr(
v, context, maxlevels, level, changed_only=changed_only
)
append("%s: %s" % (krepr, vrepr))
readable = readable and kreadable and vreadable
if krecur or vrecur:
recursive = True
del context[objid]
return "{%s}" % ", ".join(components), readable, recursive
if (issubclass(typ, list) and r is list.__repr__) or (
issubclass(typ, tuple) and r is tuple.__repr__
):
if issubclass(typ, list):
if not object:
return "[]", True, False
format = "[%s]"
elif len(object) == 1:
format = "(%s,)"
else:
if not object:
return "()", True, False
format = "(%s)"
objid = id(object)
if maxlevels and level >= maxlevels:
return format % "...", False, objid in context
if objid in context:
return pprint._recursion(object), False, True
context[objid] = 1
readable = True
recursive = False
components = []
append = components.append
level += 1
for o in object:
orepr, oreadable, orecur = _safe_repr(
o, context, maxlevels, level, changed_only=changed_only
)
append(orepr)
if not oreadable:
readable = False
if orecur:
recursive = True
del context[objid]
return format % ", ".join(components), readable, recursive
if issubclass(typ, BaseEstimator):
objid = id(object)
if maxlevels and level >= maxlevels:
return "{...}", False, objid in context
if objid in context:
return pprint._recursion(object), False, True
context[objid] = 1
readable = True
recursive = False
if changed_only:
params = _changed_params(object)
else:
params = object.get_params(deep=False)
components = []
append = components.append
level += 1
saferepr = _safe_repr
items = sorted(params.items(), key=pprint._safe_tuple)
for k, v in items:
krepr, kreadable, krecur = saferepr(
k, context, maxlevels, level, changed_only=changed_only
)
vrepr, vreadable, vrecur = saferepr(
v, context, maxlevels, level, changed_only=changed_only
)
append("%s=%s" % (krepr.strip("'"), vrepr))
readable = readable and kreadable and vreadable
if krecur or vrecur:
recursive = True
del context[objid]
return ("%s(%s)" % (typ.__name__, ", ".join(components)), readable, recursive)
rep = repr(object)
return rep, (rep and not rep.startswith("<")), False
@@ -0,0 +1,35 @@
# Authors: Arnaud Joly
#
# License: BSD 3 clause
from ._typedefs cimport uint32_t
cdef inline uint32_t DEFAULT_SEED = 1
cdef enum:
# Max value for our rand_r replacement (near the bottom).
# We don't use RAND_MAX because it's different across platforms and
# particularly tiny on Windows/MSVC.
# It corresponds to the maximum representable value for
# 32-bit signed integers (i.e. 2^31 - 1).
RAND_R_MAX = 2147483647
# rand_r replacement using a 32bit XorShift generator
# See http://www.jstatsoft.org/v08/i14/paper for details
cdef inline uint32_t our_rand_r(uint32_t* seed) nogil:
"""Generate a pseudo-random np.uint32 from a np.uint32 seed"""
# seed shouldn't ever be 0.
if (seed[0] == 0):
seed[0] = DEFAULT_SEED
seed[0] ^= <uint32_t>(seed[0] << 13)
seed[0] ^= <uint32_t>(seed[0] >> 17)
seed[0] ^= <uint32_t>(seed[0] << 5)
# Use the modulo to make sure that we don't return a value greater than the
# maximum representable value for signed 32bit integers (i.e. 2^31 - 1).
# Note that the parenthesis are needed to avoid overflow: here
# RAND_R_MAX is cast to uint32_t before 1 is added.
return seed[0] % ((<uint32_t>RAND_R_MAX) + 1)
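# Illustrative pure-Python sketch (not part of this header) of the same xorshift
# update; `py_rand_r` is a hypothetical name, and masking emulates uint32 wrap-around:
#
#     def py_rand_r(seed):
#         seed = seed or 1                     # the state must never be 0
#         seed ^= (seed << 13) & 0xFFFFFFFF
#         seed ^= seed >> 17
#         seed ^= (seed << 5) & 0xFFFFFFFF
#         return seed % (2**31), seed          # value in [0, RAND_R_MAX], new state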
@@ -0,0 +1,355 @@
# Author: Arnaud Joly
#
# License: BSD 3 clause
"""
Random utility function
=======================
This module complements missing features of ``numpy.random``.
The module contains:
* Several algorithms to sample integers without replacement.
* Fast rand_r alternative based on xor shifts
"""
import numpy as np
from . import check_random_state
from ._typedefs cimport intp_t
cdef uint32_t DEFAULT_SEED = 1
# Compatibility type to always accept the default int type used by NumPy, both
# before and after NumPy 2. On Windows, `long` does not always match `intp_t`.
# See the comments in the `sample_without_replacement` Python function for more
# details.
ctypedef fused default_int:
intp_t
long
cpdef _sample_without_replacement_check_input(default_int n_population,
default_int n_samples):
""" Check that input are consistent for sample_without_replacement"""
if n_population < 0:
raise ValueError('n_population should be non-negative, got %s.'
% n_population)
if n_samples > n_population:
raise ValueError('n_population should be greater than or equal to '
'n_samples, got n_samples > n_population (%s > %s)'
% (n_samples, n_population))
cpdef _sample_without_replacement_with_tracking_selection(
default_int n_population,
default_int n_samples,
random_state=None):
r"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity:
- Worst-case: unbounded
- Average-case:
O(O(np.random.randint) * \sum_{i=1}^{n_samples} 1 /
(1 - i / n_population))
<= O(O(np.random.randint) *
n_population * ln((n_population - 2)
/(n_population - 1 - n_samples)))
<= O(O(np.random.randint) *
n_population * 1 / (1 - n_samples / n_population))
Space complexity of O(n_samples) in a python set.
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integers to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subset of integers.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef default_int i
cdef default_int j
cdef default_int[::1] out = np.empty((n_samples, ), dtype=int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# The following lines of code are heavily inspired by Python core,
# more precisely by random.sample.
cdef set selected = set()
for i in range(n_samples):
j = rng_randint(n_population)
while j in selected:
j = rng_randint(n_population)
selected.add(j)
out[i] = j
return np.asarray(out)
cpdef _sample_without_replacement_with_pool(default_int n_population,
default_int n_samples,
random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity: O(n_population + O(np.random.randint) * n_samples)
Space complexity of O(n_population + n_samples).
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integers to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subset of integers.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef default_int i
cdef default_int j
cdef default_int[::1] out = np.empty((n_samples,), dtype=int)
cdef default_int[::1] pool = np.empty((n_population,), dtype=int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# Initialize the pool
for i in range(n_population):
pool[i] = i
# The following lines of code are heavily inspired by Python core,
# more precisely by random.sample.
for i in range(n_samples):
j = rng_randint(n_population - i) # invariant: non-selected at [0,n-i)
out[i] = pool[j]
pool[j] = pool[n_population - i - 1] # move non-selected item into vacancy
return np.asarray(out)
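# Illustrative pure-Python sketch of the pool-based loop above (not used by the
# library; the helper name below is hypothetical). `rng` is assumed to be a numpy
# RandomState, e.g. obtained from check_random_state.
def _pool_sampling_sketch_py(n_population, n_samples, rng):
    pool = list(range(n_population))
    out = []
    for i in range(n_samples):
        # pick among the still-unselected prefix pool[0:n_population - i)
        j = rng.randint(n_population - i)
        out.append(pool[j])
        # move the last unselected item into the vacated slot so the prefix
        # keeps holding unselected items only
        pool[j] = pool[n_population - i - 1]
    return out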
cpdef _sample_without_replacement_with_reservoir_sampling(
default_int n_population,
default_int n_samples,
random_state=None
):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Time complexity of
O((n_population - n_samples) * O(np.random.randint) + n_samples)
Space complexity of O(n_samples)
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integers to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subset of integers. The order of the items is not
necessarily random. Use a random permutation of the array if the order
of the items has to be randomized.
"""
_sample_without_replacement_check_input(n_population, n_samples)
cdef default_int i
cdef default_int j
cdef default_int[::1] out = np.empty((n_samples, ), dtype=int)
rng = check_random_state(random_state)
rng_randint = rng.randint
# This cython implementation is based on the one of Robert Kern:
# http://mail.scipy.org/pipermail/numpy-discussion/2010-December/
# 054289.html
#
for i in range(n_samples):
out[i] = i
for i from n_samples <= i < n_population:
j = rng_randint(0, i + 1)
if j < n_samples:
out[j] = i
return np.asarray(out)
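# Illustrative pure-Python sketch of the reservoir scheme above (not used by the
# library; the helper name below is hypothetical). The first n_samples integers
# seed the reservoir, and each later integer i overwrites a random slot with
# probability n_samples / (i + 1), which keeps the selection uniform.
def _reservoir_sampling_sketch_py(n_population, n_samples, rng):
    out = list(range(n_samples))
    for i in range(n_samples, n_population):
        j = rng.randint(0, i + 1)
        if j < n_samples:
            out[j] = i
    return out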
cdef _sample_without_replacement(default_int n_population,
default_int n_samples,
method="auto",
random_state=None):
"""Sample integers without replacement.
Private function for the implementation, see sample_without_replacement
documentation for more details.
"""
_sample_without_replacement_check_input(n_population, n_samples)
all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
ratio = <double> n_samples / n_population if n_population != 0.0 else 1.0
# Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99
if method == "auto" and ratio > 0.01 and ratio < 0.99:
rng = check_random_state(random_state)
return rng.permutation(n_population)[:n_samples]
if method == "auto" or method == "tracking_selection":
# TODO the pool based method can also be used.
# however, it requires special benchmark to take into account
# the memory requirement of the array vs the set.
# The value 0.2 has been determined through benchmarking.
if ratio < 0.2:
return _sample_without_replacement_with_tracking_selection(
n_population, n_samples, random_state)
else:
return _sample_without_replacement_with_reservoir_sampling(
n_population, n_samples, random_state)
elif method == "reservoir_sampling":
return _sample_without_replacement_with_reservoir_sampling(
n_population, n_samples, random_state)
elif method == "pool":
return _sample_without_replacement_with_pool(n_population, n_samples,
random_state)
else:
raise ValueError('Expected a method name in %s, got %s. '
% (all_methods, method))
def sample_without_replacement(
object n_population, object n_samples, method="auto", random_state=None):
"""Sample integers without replacement.
Select n_samples integers from the set [0, n_population) without
replacement.
Parameters
----------
n_population : int
The size of the set to sample from.
n_samples : int
The number of integers to sample.
random_state : int, RandomState instance or None, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
method : {"auto", "tracking_selection", "reservoir_sampling", "pool"}, \
default='auto'
If method == "auto", the ratio of n_samples / n_population is used
to determine which algorithm to use:
If ratio is between 0 and 0.01, tracking selection is used.
If ratio is between 0.01 and 0.99, numpy.random.permutation is used.
If ratio is greater than 0.99, reservoir sampling is used.
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
If method =="tracking_selection", a set based implementation is used
which is suitable for `n_samples` <<< `n_population`.
If method == "reservoir_sampling", a reservoir sampling algorithm is
used which is suitable for high memory constraint or when
O(`n_samples`) ~ O(`n_population`).
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
If method == "pool", a pool based algorithm is particularly fast, even
faster than the tracking selection method. However, a vector containing
the entire population has to be initialized.
If n_samples ~ n_population, the reservoir sampling method is faster.
Returns
-------
out : ndarray of shape (n_samples,)
The sampled subset of integers. The subset of selected integers might
not be randomized, see the method argument.
Examples
--------
>>> from sklearn.utils.random import sample_without_replacement
>>> sample_without_replacement(10, 5, random_state=42)
array([8, 1, 5, 0, 7])
"""
cdef:
intp_t n_pop_intp, n_samples_intp
long n_pop_long, n_samples_long
# On most platforms `np.int_ is np.intp`. However, before NumPy 2 the
# default integer `np.int_` was a long which is 32bit on 64bit windows
# while `intp` is 64bit on 64bit platforms and 32bit on 32bit ones.
if np.int_ is np.intp:
# Branch always taken on NumPy >=2 (or when not on 64bit windows).
# Cython has different rules for conversion of values to integers.
# For NumPy <1.26.2 AND Cython 3, this first branch requires `int()`
# called explicitly to allow e.g. floats.
n_pop_intp = int(n_population)
n_samples_intp = int(n_samples)
return _sample_without_replacement(
n_pop_intp, n_samples_intp, method, random_state)
else:
# Branch taken on 64bit windows with Numpy<2.0 where `long` is 32bit
n_pop_long = n_population
n_samples_long = n_samples
return _sample_without_replacement(
n_pop_long, n_samples_long, method, random_state)
def _our_rand_r_py(seed):
"""Python utils to test the our_rand_r function"""
cdef uint32_t my_seed = seed
return our_rand_r(&my_seed)
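# Usage note (illustrative only): the heuristic described in
# sample_without_replacement's docstring can be bypassed by passing `method`
# explicitly, e.g.
#
#     from sklearn.utils.random import sample_without_replacement
#     sample_without_replacement(1_000_000, 10, method="tracking_selection")
#     sample_without_replacement(1_000, 900, method="reservoir_sampling")
#     sample_without_replacement(1_000, 500, method="pool")
#
# All methods return n_samples distinct integers from [0, n_population); only
# the ordering guarantees and the time/memory trade-offs differ.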
@@ -0,0 +1,314 @@
"""Utilities to get the response values of a classifier or a regressor.
It allows us to make uniform checks and validation.
"""
import numpy as np
from ..base import is_classifier
from .multiclass import type_of_target
from .validation import _check_response_method, check_is_fitted
def _process_predict_proba(*, y_pred, target_type, classes, pos_label):
"""Get the response values when the response method is `predict_proba`.
This function processes the `y_pred` array in the binary and multi-label cases.
In the binary case, it selects the column corresponding to the positive
class. In the multi-label case, it stacks the predictions if they are not
in the "compressed" format `(n_samples, n_outputs)`.
Parameters
----------
y_pred : ndarray
Output of `estimator.predict_proba`. The shape depends on the target type:
- for binary classification, it is a 2d array of shape `(n_samples, 2)`;
- for multiclass classification, it is a 2d array of shape
`(n_samples, n_classes)`;
- for multilabel classification, it is either a list of 2d arrays of shape
`(n_samples, 2)` (e.g. `RandomForestClassifier` or `KNeighborsClassifier`) or
an array of shape `(n_samples, n_outputs)` (e.g. `MLPClassifier` or
`RidgeClassifier`).
target_type : {"binary", "multiclass", "multilabel-indicator"}
Type of the target.
classes : ndarray of shape (n_classes,) or list of such arrays
Class labels as reported by `estimator.classes_`.
pos_label : int, float, bool or str
Only used with binary and multiclass targets.
Returns
-------
y_pred : ndarray of shape (n_samples,), (n_samples, n_classes) or \
(n_samples, n_output)
Compressed predictions format as requested by the metrics.
"""
if target_type == "binary" and y_pred.shape[1] < 2:
# We don't handle classifiers trained on a single class.
raise ValueError(
f"Got predict_proba of shape {y_pred.shape}, but need "
"classifier with two classes."
)
if target_type == "binary":
col_idx = np.flatnonzero(classes == pos_label)[0]
return y_pred[:, col_idx]
elif target_type == "multilabel-indicator":
# Use a compressed format of shape `(n_samples, n_outputs)`.
# Only `MLPClassifier` and `RidgeClassifier` return an array of shape
# `(n_samples, n_outputs)`.
if isinstance(y_pred, list):
# list of arrays of shape `(n_samples, 2)`
return np.vstack([p[:, -1] for p in y_pred]).T
else:
# array of shape `(n_samples, n_outputs)`
return y_pred
return y_pred
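# Illustrative example (values computed by hand, shown as comments only): for a
# binary problem with classes [0, 1] and pos_label=1, the helper keeps the
# probability column of the positive class.
#
#     >>> import numpy as np
#     >>> proba = np.array([[0.8, 0.2], [0.3, 0.7]])
#     >>> _process_predict_proba(y_pred=proba, target_type="binary",
#     ...                        classes=np.array([0, 1]), pos_label=1)
#     array([0.2, 0.7])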
def _process_decision_function(*, y_pred, target_type, classes, pos_label):
"""Get the response values when the response method is `decision_function`.
This function processes the `y_pred` array in the binary and multi-label cases.
In the binary case, it inverts the sign of the score if the positive label
is not `classes[1]`. In the multi-label case, it stacks the predictions if
they are not in the "compressed" format `(n_samples, n_outputs)`.
Parameters
----------
y_pred : ndarray
Output of `estimator.decision_function`. The shape depends on the target type:
- for binary classification, it is a 1d array of shape `(n_samples,)` where the
sign is assuming that `classes[1]` is the positive class;
- for multiclass classification, it is a 2d array of shape
`(n_samples, n_classes)`;
- for multilabel classification, it is a 2d array of shape `(n_samples,
n_outputs)`.
target_type : {"binary", "multiclass", "multilabel-indicator"}
Type of the target.
classes : ndarray of shape (n_classes,) or list of such arrays
Class labels as reported by `estimator.classes_`.
pos_label : int, float, bool or str
Only used with binary and multiclass targets.
Returns
-------
y_pred : ndarray of shape (n_samples,), (n_samples, n_classes) or \
(n_samples, n_output)
Compressed predictions format as requested by the metrics.
"""
if target_type == "binary" and pos_label == classes[0]:
return -1 * y_pred
return y_pred
def _get_response_values(
estimator,
X,
response_method,
pos_label=None,
return_response_method_used=False,
):
"""Compute the response values of a classifier, an outlier detector, or a regressor.
The response values are predictions such that it follows the following shape:
- for binary classification, it is a 1d array of shape `(n_samples,)`;
- for multiclass classification, it is a 2d array of shape `(n_samples, n_classes)`;
- for multilabel classification, it is a 2d array of shape `(n_samples, n_outputs)`;
- for outlier detection, it is a 1d array of shape `(n_samples,)`;
- for regression, it is a 1d array of shape `(n_samples,)`.
If `estimator` is a binary classifier, also return the label for the
effective positive class.
This utility is used primarily in the displays and the scikit-learn scorers.
.. versionadded:: 1.3
Parameters
----------
estimator : estimator instance
Fitted classifier, outlier detector, or regressor or a
fitted :class:`~sklearn.pipeline.Pipeline` in which the last estimator is a
classifier, an outlier detector, or a regressor.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
response_method : {"predict_proba", "predict_log_proba", "decision_function", \
"predict"} or list of such str
Specifies the response method to use to get predictions from an estimator
(i.e. :term:`predict_proba`, :term:`predict_log_proba`,
:term:`decision_function` or :term:`predict`). Possible choices are:
- if `str`, it corresponds to the name of the method to return;
- if a list of `str`, it provides the method names in order of
preference. The method used corresponds to the first method in
the list that is implemented by `estimator`.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when computing
the metrics. If `None` and target is 'binary', `estimator.classes_[1]` is
considered as the positive class.
return_response_method_used : bool, default=False
Whether to return the response method used to compute the response
values.
.. versionadded:: 1.4
Returns
-------
y_pred : ndarray of shape (n_samples,), (n_samples, n_classes) or \
(n_samples, n_outputs)
Target scores calculated from the provided `response_method`
and `pos_label`.
pos_label : int, float, bool, str or None
The class considered as the positive class when computing
the metrics. Returns `None` if `estimator` is a regressor or an outlier
detector.
response_method_used : str
The response method used to compute the response values. Only returned
if `return_response_method_used` is `True`.
.. versionadded:: 1.4
Raises
------
ValueError
If `pos_label` is not a valid label.
If the shape of `y_pred` is not consistent for binary classifier.
If the response method can be applied to a classifier only and
`estimator` is a regressor.
"""
from sklearn.base import is_classifier, is_outlier_detector # noqa
if is_classifier(estimator):
prediction_method = _check_response_method(estimator, response_method)
classes = estimator.classes_
target_type = type_of_target(classes)
if target_type in ("binary", "multiclass"):
if pos_label is not None and pos_label not in classes.tolist():
raise ValueError(
f"pos_label={pos_label} is not a valid label: It should be "
f"one of {classes}"
)
elif pos_label is None and target_type == "binary":
pos_label = classes[-1]
y_pred = prediction_method(X)
if prediction_method.__name__ in ("predict_proba", "predict_log_proba"):
y_pred = _process_predict_proba(
y_pred=y_pred,
target_type=target_type,
classes=classes,
pos_label=pos_label,
)
elif prediction_method.__name__ == "decision_function":
y_pred = _process_decision_function(
y_pred=y_pred,
target_type=target_type,
classes=classes,
pos_label=pos_label,
)
elif is_outlier_detector(estimator):
prediction_method = _check_response_method(estimator, response_method)
y_pred, pos_label = prediction_method(X), None
else: # estimator is a regressor
if response_method != "predict":
raise ValueError(
f"{estimator.__class__.__name__} should either be a classifier to be "
f"used with response_method={response_method} or the response_method "
"should be 'predict'. Got a regressor with response_method="
f"{response_method} instead."
)
prediction_method = estimator.predict
y_pred, pos_label = prediction_method(X), None
if return_response_method_used:
return y_pred, pos_label, prediction_method.__name__
return y_pred, pos_label
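# Illustrative usage sketch (assumes a fitted binary classifier; the dataset and
# estimator choice here are arbitrary):
#
#     >>> from sklearn.datasets import make_classification
#     >>> from sklearn.linear_model import LogisticRegression
#     >>> X, y = make_classification(random_state=0)
#     >>> clf = LogisticRegression().fit(X, y)
#     >>> y_score, pos_label = _get_response_values(
#     ...     clf, X, response_method=["decision_function", "predict_proba"])
#     >>> y_score.shape   # decision_function exists, so it is used first
#     (100,)
#     >>> int(pos_label)  # defaults to classes_[-1] for binary targets
#     1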
def _get_response_values_binary(
estimator, X, response_method, pos_label=None, return_response_method_used=False
):
"""Compute the response values of a binary classifier.
Parameters
----------
estimator : estimator instance
Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`
in which the last estimator is a binary classifier.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input values.
response_method : {'auto', 'predict_proba', 'decision_function'}
Specifies whether to use :term:`predict_proba` or
:term:`decision_function` as the target response. If set to 'auto',
:term:`predict_proba` is tried first and if it does not exist
:term:`decision_function` is tried next.
pos_label : int, float, bool or str, default=None
The class considered as the positive class when computing
the metrics. By default, `estimator.classes_[1]` is
considered as the positive class.
return_response_method_used : bool, default=False
Whether to return the response method used to compute the response
values.
.. versionadded:: 1.5
Returns
-------
y_pred : ndarray of shape (n_samples,)
Target scores calculated from the provided response_method
and pos_label.
pos_label : int, float, bool or str
The class considered as the positive class when computing
the metrics.
response_method_used : str
The response method used to compute the response values. Only returned
if `return_response_method_used` is `True`.
.. versionadded:: 1.5
"""
classification_error = "Expected 'estimator' to be a binary classifier."
check_is_fitted(estimator)
if not is_classifier(estimator):
raise ValueError(
classification_error + f" Got {estimator.__class__.__name__} instead."
)
elif len(estimator.classes_) != 2:
raise ValueError(
classification_error + f" Got {len(estimator.classes_)} classes instead."
)
if response_method == "auto":
response_method = ["predict_proba", "decision_function"]
return _get_response_values(
estimator,
X,
response_method,
pos_label=pos_label,
return_response_method_used=return_response_method_used,
)
@@ -0,0 +1,76 @@
{{py:
"""
Dataset abstractions for sequential data access.
Template file to easily generate fused-type-consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _seq_dataset.pxd
Each class is duplicated for all dtypes (float and double). The keywords
between double braces are substituted in setup.py.
"""
# name_suffix, c_type
dtypes = [('64', 'float64_t'),
('32', 'float32_t')]
}}
"""Dataset abstractions for sequential data access."""
from ._typedefs cimport float32_t, float64_t, intp_t, uint32_t
# SequentialDataset and its two concrete subclasses are (optionally randomized)
# iterators over the rows of a matrix X and corresponding target values y.
{{for name_suffix, c_type in dtypes}}
#------------------------------------------------------------------------------
cdef class SequentialDataset{{name_suffix}}:
cdef int current_index
cdef int[::1] index
cdef int *index_data_ptr
cdef Py_ssize_t n_samples
cdef uint32_t seed
cdef void shuffle(self, uint32_t seed) noexcept nogil
cdef int _get_next_index(self) noexcept nogil
cdef int _get_random_index(self) noexcept nogil
cdef void _sample(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight,
int current_index) noexcept nogil
cdef void next(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) noexcept nogil
cdef int random(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) noexcept nogil
cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
cdef const {{c_type}}[:, ::1] X
cdef const {{c_type}}[::1] Y
cdef const {{c_type}}[::1] sample_weights
cdef Py_ssize_t n_features
cdef intp_t X_stride
cdef {{c_type}} *X_data_ptr
cdef {{c_type}} *Y_data_ptr
cdef const int[::1] feature_indices
cdef int *feature_indices_ptr
cdef {{c_type}} *sample_weight_data
cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
cdef const {{c_type}}[::1] X_data
cdef const int[::1] X_indptr
cdef const int[::1] X_indices
cdef const {{c_type}}[::1] Y
cdef const {{c_type}}[::1] sample_weights
cdef {{c_type}} *X_data_ptr
cdef int *X_indptr_ptr
cdef int *X_indices_ptr
cdef {{c_type}} *Y_data_ptr
cdef {{c_type}} *sample_weight_data
{{endfor}}
@@ -0,0 +1,351 @@
{{py:
"""
Dataset abstractions for sequential data access.
Template file to easily generate fused-type-consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _seq_dataset.pyx
Each class is duplicated for all dtypes (float and double). The keywords
between double braces are substituted in setup.py.
Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
Arthur Imbert <arthurimbert05@gmail.com>
Joan Massich <mailsik@gmail.com>
License: BSD 3 clause
"""
# name_suffix, c_type, np_type
dtypes = [('64', 'float64_t', 'np.float64'),
('32', 'float32_t', 'np.float32')]
}}
"""Dataset abstractions for sequential data access."""
import numpy as np
cimport cython
from libc.limits cimport INT_MAX
from ._random cimport our_rand_r
from ._typedefs cimport float32_t, float64_t, uint32_t
{{for name_suffix, c_type, np_type in dtypes}}
#------------------------------------------------------------------------------
cdef class SequentialDataset{{name_suffix}}:
"""Base class for datasets with sequential data access.
SequentialDataset is used to iterate over the rows of a matrix X and
corresponding target values y, i.e. to iterate over samples.
There are two methods to get the next sample:
- next : Iterate sequentially (optionally randomized)
- random : Iterate randomly (with replacement)
Attributes
----------
index : np.ndarray
Index array for fast shuffling.
index_data_ptr : int
Pointer to the index array.
current_index : int
Index of current sample in ``index``.
The index of current sample in the data is given by
index_data_ptr[current_index].
n_samples : Py_ssize_t
Number of samples in the dataset.
seed : uint32_t
Seed used for random sampling. This attribute is modified at each call to the
`random` method.
"""
cdef void next(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) noexcept nogil:
"""Get the next example ``x`` from the dataset.
This method gets the next sample looping sequentially over all samples.
The order can be shuffled with the method ``shuffle``.
Shuffling once before iterating over all samples corresponds to a
random draw without replacement. It is used for instance in SGD solver.
Parameters
----------
x_data_ptr : {{c_type}}**
A pointer to the {{c_type}} array which holds the feature
values of the next example.
x_ind_ptr : np.intc**
A pointer to the int array which holds the feature
indices of the next example.
nnz : int*
A pointer to an int holding the number of non-zero
values of the next example.
y : {{c_type}}*
The target value of the next example.
sample_weight : {{c_type}}*
The weight of the next example.
"""
cdef int current_index = self._get_next_index()
self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight,
current_index)
cdef int random(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight) noexcept nogil:
"""Get a random example ``x`` from the dataset.
This method gets the next sample chosen randomly over a uniform
distribution. It corresponds to a random draw with replacement.
It is used for instance in SAG solver.
Parameters
----------
x_data_ptr : {{c_type}}**
A pointer to the {{c_type}} array which holds the feature
values of the next example.
x_ind_ptr : np.intc**
A pointer to the int array which holds the feature
indices of the next example.
nnz : int*
A pointer to an int holding the number of non-zero
values of the next example.
y : {{c_type}}*
The target value of the next example.
sample_weight : {{c_type}}*
The weight of the next example.
Returns
-------
current_index : int
Index of current sample.
"""
cdef int current_index = self._get_random_index()
self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight,
current_index)
return current_index
cdef void shuffle(self, uint32_t seed) noexcept nogil:
"""Permutes the ordering of examples."""
# Fisher-Yates shuffle
cdef int *ind = self.index_data_ptr
cdef int n = self.n_samples
cdef unsigned i, j
for i in range(n - 1):
j = i + our_rand_r(&seed) % (n - i)
ind[i], ind[j] = ind[j], ind[i]
cdef int _get_next_index(self) noexcept nogil:
cdef int current_index = self.current_index
if current_index >= (self.n_samples - 1):
current_index = -1
current_index += 1
self.current_index = current_index
return self.current_index
cdef int _get_random_index(self) noexcept nogil:
cdef int n = self.n_samples
cdef int current_index = our_rand_r(&self.seed) % n
self.current_index = current_index
return current_index
cdef void _sample(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight,
int current_index) noexcept nogil:
pass
def _shuffle_py(self, uint32_t seed):
"""python function used for easy testing"""
self.shuffle(seed)
def _next_py(self):
"""python function used for easy testing"""
cdef int current_index = self._get_next_index()
return self._sample_py(current_index)
def _random_py(self):
"""python function used for easy testing"""
cdef int current_index = self._get_random_index()
return self._sample_py(current_index)
def _sample_py(self, int current_index):
"""python function used for easy testing"""
cdef {{c_type}}* x_data_ptr
cdef int* x_indices_ptr
cdef int nnz, j
cdef {{c_type}} y, sample_weight
# call _sample in cython
self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight,
current_index)
# transform the pointed data in numpy CSR array
cdef {{c_type}}[:] x_data = np.empty(nnz, dtype={{np_type}})
cdef int[:] x_indices = np.empty(nnz, dtype=np.int32)
cdef int[:] x_indptr = np.asarray([0, nnz], dtype=np.int32)
for j in range(nnz):
x_data[j] = x_data_ptr[j]
x_indices[j] = x_indices_ptr[j]
cdef int sample_idx = self.index_data_ptr[current_index]
return (
(np.asarray(x_data), np.asarray(x_indices), np.asarray(x_indptr)),
y,
sample_weight,
sample_idx,
)
cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
"""Dataset backed by a two-dimensional numpy array.
The dtype of the numpy array is expected to be ``{{np_type}}`` ({{c_type}})
with a C-style memory layout.
"""
def __cinit__(
self,
const {{c_type}}[:, ::1] X,
const {{c_type}}[::1] Y,
const {{c_type}}[::1] sample_weights,
uint32_t seed=1,
):
"""A ``SequentialDataset`` backed by a two-dimensional numpy array.
Parameters
----------
X : ndarray, dtype={{c_type}}, ndim=2, mode='c'
The sample array, of shape (n_samples, n_features)
Y : ndarray, dtype={{c_type}}, ndim=1, mode='c'
The target array, of shape (n_samples,)
sample_weights : ndarray, dtype={{c_type}}, ndim=1, mode='c'
The weight of each sample, of shape (n_samples,)
"""
if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX:
raise ValueError("More than %d samples or features not supported;"
" got (%d, %d)."
% (INT_MAX, X.shape[0], X.shape[1]))
# keep a reference to the data to prevent garbage collection
self.X = X
self.Y = Y
self.sample_weights = sample_weights
self.n_samples = X.shape[0]
self.n_features = X.shape[1]
self.feature_indices = np.arange(0, self.n_features, dtype=np.intc)
self.feature_indices_ptr = <int *> &self.feature_indices[0]
self.current_index = -1
self.X_stride = X.strides[0] // X.itemsize
self.X_data_ptr = <{{c_type}} *> &X[0, 0]
self.Y_data_ptr = <{{c_type}} *> &Y[0]
self.sample_weight_data = <{{c_type}} *> &sample_weights[0]
# Use index array for fast shuffling
self.index = np.arange(0, self.n_samples, dtype=np.intc)
self.index_data_ptr = <int *> &self.index[0]
# seed should not be 0 for our_rand_r
self.seed = max(seed, 1)
cdef void _sample(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight,
int current_index) noexcept nogil:
cdef long long sample_idx = self.index_data_ptr[current_index]
cdef long long offset = sample_idx * self.X_stride
y[0] = self.Y_data_ptr[sample_idx]
x_data_ptr[0] = self.X_data_ptr + offset
x_ind_ptr[0] = self.feature_indices_ptr
nnz[0] = self.n_features
sample_weight[0] = self.sample_weight_data[sample_idx]
cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
"""A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """
def __cinit__(
self,
const {{c_type}}[::1] X_data,
const int[::1] X_indptr,
const int[::1] X_indices,
const {{c_type}}[::1] Y,
const {{c_type}}[::1] sample_weights,
uint32_t seed=1,
):
"""Dataset backed by a scipy sparse CSR matrix.
The feature indices of ``x`` are given by x_ind_ptr[0:nnz].
The corresponding feature values are given by
x_data_ptr[0:nnz].
Parameters
----------
X_data : ndarray, dtype={{c_type}}, ndim=1, mode='c'
The data array of the CSR features matrix.
X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c'
The index pointer array of the CSR features matrix.
X_indices : ndarray, dtype=np.intc, ndim=1, mode='c'
The column indices array of the CSR features matrix.
Y : ndarray, dtype={{c_type}}, ndim=1, mode='c'
The target values.
sample_weights : ndarray, dtype={{c_type}}, ndim=1, mode='c'
The weight of each sample.
"""
# keep a reference to the data to prevent garbage collection
self.X_data = X_data
self.X_indptr = X_indptr
self.X_indices = X_indices
self.Y = Y
self.sample_weights = sample_weights
self.n_samples = Y.shape[0]
self.current_index = -1
self.X_data_ptr = <{{c_type}} *> &X_data[0]
self.X_indptr_ptr = <int *> &X_indptr[0]
self.X_indices_ptr = <int *> &X_indices[0]
self.Y_data_ptr = <{{c_type}} *> &Y[0]
self.sample_weight_data = <{{c_type}} *> &sample_weights[0]
# Use index array for fast shuffling
self.index = np.arange(self.n_samples, dtype=np.intc)
self.index_data_ptr = <int *> &self.index[0]
# seed should not be 0 for our_rand_r
self.seed = max(seed, 1)
cdef void _sample(self, {{c_type}} **x_data_ptr, int **x_ind_ptr,
int *nnz, {{c_type}} *y, {{c_type}} *sample_weight,
int current_index) noexcept nogil:
cdef long long sample_idx = self.index_data_ptr[current_index]
cdef long long offset = self.X_indptr_ptr[sample_idx]
y[0] = self.Y_data_ptr[sample_idx]
x_data_ptr[0] = self.X_data_ptr + offset
x_ind_ptr[0] = self.X_indices_ptr + offset
nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset
sample_weight[0] = self.sample_weight_data[sample_idx]
{{endfor}}
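# Illustrative usage sketch relying on the private Python testing helpers defined
# above (the SGD/SAG solvers themselves use the nogil C-level API):
#
#     >>> import numpy as np
#     >>> from sklearn.utils._seq_dataset import ArrayDataset64
#     >>> X = np.ascontiguousarray(np.random.rand(5, 3))
#     >>> y = np.arange(5, dtype=np.float64)
#     >>> sw = np.ones(5, dtype=np.float64)
#     >>> dataset = ArrayDataset64(X, y, sw, seed=42)
#     >>> (x_data, x_indices, x_indptr), target, weight, idx = dataset._next_py()
#     >>> int(idx)  # first sample visited in sequential (unshuffled) order
#     0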
@@ -0,0 +1,459 @@
import importlib
from functools import wraps
from typing import Protocol, runtime_checkable
import numpy as np
from scipy.sparse import issparse
from .._config import get_config
from ._available_if import available_if
def check_library_installed(library):
"""Check library is installed."""
try:
return importlib.import_module(library)
except ImportError as exc:
raise ImportError(
f"Setting output container to '{library}' requires {library} to be"
" installed"
) from exc
def get_columns(columns):
if callable(columns):
try:
return columns()
except Exception:
return None
return columns
@runtime_checkable
class ContainerAdapterProtocol(Protocol):
container_lib: str
def create_container(self, X_output, X_original, columns, inplace=False):
"""Create container from `X_output` with additional metadata.
Parameters
----------
X_output : {ndarray, dataframe}
Data to wrap.
X_original : {ndarray, dataframe}
Original input dataframe. This is used to extract the metadata that should
be passed to `X_output`, e.g. pandas row index.
columns : callable, ndarray, or None
The column names or a callable that returns the column names. The
callable is useful if the column names require some computation. If `None`,
then no columns are passed to the container's constructor.
inplace : bool, default=False
Whether or not we intend to modify `X_output` in-place. However, it does
not guarantee that we return the same object if the in-place operation
is not possible.
Returns
-------
wrapped_output : container_type
`X_output` wrapped into the container type.
"""
def is_supported_container(self, X):
"""Return True if X is a supported container.
Parameters
----------
X : container
Container to be checked.
Returns
-------
is_supported_container : bool
True if X is a supported container.
"""
def rename_columns(self, X, columns):
"""Rename columns in `X`.
Parameters
----------
X : container
Container whose columns are updated.
columns : ndarray of str
Columns to update the `X`'s columns with.
Returns
-------
updated_container : container
Container with new names.
"""
def hstack(self, Xs):
"""Stack containers horizontally (column-wise).
Parameters
----------
Xs : list of containers
List of containers to stack.
Returns
-------
stacked_Xs : container
Stacked containers.
"""
class PandasAdapter:
container_lib = "pandas"
def create_container(self, X_output, X_original, columns, inplace=True):
pd = check_library_installed("pandas")
columns = get_columns(columns)
if not inplace or not isinstance(X_output, pd.DataFrame):
# In all these cases, we need to create a new DataFrame
# Unfortunately, we cannot use `getattr(container, "index")`
# because `list` exposes an `index` attribute.
if isinstance(X_output, pd.DataFrame):
index = X_output.index
elif isinstance(X_original, pd.DataFrame):
index = X_original.index
else:
index = None
# We don't pass columns here because it would imply column selection
# instead of renaming.
X_output = pd.DataFrame(X_output, index=index, copy=not inplace)
if columns is not None:
return self.rename_columns(X_output, columns)
return X_output
def is_supported_container(self, X):
pd = check_library_installed("pandas")
return isinstance(X, pd.DataFrame)
def rename_columns(self, X, columns):
# we cannot use `rename` since it takes a dictionary and at this stage we have
# potentially duplicate column names in `X`
X.columns = columns
return X
def hstack(self, Xs):
pd = check_library_installed("pandas")
return pd.concat(Xs, axis=1)
class PolarsAdapter:
container_lib = "polars"
def create_container(self, X_output, X_original, columns, inplace=True):
pl = check_library_installed("polars")
columns = get_columns(columns)
columns = columns.tolist() if isinstance(columns, np.ndarray) else columns
if not inplace or not isinstance(X_output, pl.DataFrame):
# In all these cases, we need to create a new DataFrame
return pl.DataFrame(X_output, schema=columns, orient="row")
if columns is not None:
return self.rename_columns(X_output, columns)
return X_output
def is_supported_container(self, X):
pl = check_library_installed("polars")
return isinstance(X, pl.DataFrame)
def rename_columns(self, X, columns):
# we cannot use `rename` since it takes a dictionary and at this stage we have
# potentially duplicate column names in `X`
X.columns = columns
return X
def hstack(self, Xs):
pl = check_library_installed("polars")
return pl.concat(Xs, how="horizontal")
class ContainerAdaptersManager:
def __init__(self):
self.adapters = {}
@property
def supported_outputs(self):
return {"default"} | set(self.adapters)
def register(self, adapter):
self.adapters[adapter.container_lib] = adapter
ADAPTERS_MANAGER = ContainerAdaptersManager()
ADAPTERS_MANAGER.register(PandasAdapter())
ADAPTERS_MANAGER.register(PolarsAdapter())
def _get_adapter_from_container(container):
"""Get the adapter that knows how to handle such container.
See :class:`sklearn.utils._set_output.ContainerAdapterProtocol` for more
details.
"""
module_name = container.__class__.__module__.split(".")[0]
try:
return ADAPTERS_MANAGER.adapters[module_name]
except KeyError as exc:
available_adapters = list(ADAPTERS_MANAGER.adapters.keys())
raise ValueError(
"The container does not have a registered adapter in scikit-learn. "
f"Available adapters are: {available_adapters} while the container "
f"provided is: {container!r}."
) from exc
def _get_container_adapter(method, estimator=None):
"""Get container adapter."""
dense_config = _get_output_config(method, estimator)["dense"]
try:
return ADAPTERS_MANAGER.adapters[dense_config]
except KeyError:
return None
def _get_output_config(method, estimator=None):
"""Get output config based on estimator and global configuration.
Parameters
----------
method : {"transform"}
Estimator's method for which the output container is looked up.
estimator : estimator instance or None
Estimator to get the output configuration from. If `None`, the global
configuration is used.
Returns
-------
config : dict
Dictionary with keys:
- "dense": specifies the dense container for `method`. This can be
`"default"` or `"pandas"`.
"""
est_sklearn_output_config = getattr(estimator, "_sklearn_output_config", {})
if method in est_sklearn_output_config:
dense_config = est_sklearn_output_config[method]
else:
dense_config = get_config()[f"{method}_output"]
supported_outputs = ADAPTERS_MANAGER.supported_outputs
if dense_config not in supported_outputs:
raise ValueError(
f"output config must be in {sorted(supported_outputs)}, got {dense_config}"
)
return {"dense": dense_config}
def _wrap_data_with_container(method, data_to_wrap, original_input, estimator):
"""Wrap output with container based on an estimator's or global config.
Parameters
----------
method : {"transform"}
Estimator's method to get container output for.
data_to_wrap : {ndarray, dataframe}
Data to wrap with container.
original_input : {ndarray, dataframe}
Original input of function.
estimator : estimator instance
Estimator to get the output configuration from.
Returns
-------
output : {ndarray, dataframe}
If the output config is "default" or the estimator is not configured
for wrapping return `data_to_wrap` unchanged.
If the output config is "pandas", return `data_to_wrap` as a pandas
DataFrame.
"""
output_config = _get_output_config(method, estimator)
if output_config["dense"] == "default" or not _auto_wrap_is_configured(estimator):
return data_to_wrap
dense_config = output_config["dense"]
if issparse(data_to_wrap):
raise ValueError(
"The transformer outputs a scipy sparse matrix. "
"Try to set the transformer output to a dense array or disable "
f"{dense_config.capitalize()} output with set_output(transform='default')."
)
adapter = ADAPTERS_MANAGER.adapters[dense_config]
return adapter.create_container(
data_to_wrap,
original_input,
columns=estimator.get_feature_names_out,
)
def _wrap_method_output(f, method):
"""Wrapper used by `_SetOutputMixin` to automatically wrap methods."""
@wraps(f)
def wrapped(self, X, *args, **kwargs):
data_to_wrap = f(self, X, *args, **kwargs)
if isinstance(data_to_wrap, tuple):
# only wrap the first output for cross decomposition
return_tuple = (
_wrap_data_with_container(method, data_to_wrap[0], X, self),
*data_to_wrap[1:],
)
# Support for namedtuples: `_make` is a documented API for namedtuples:
# https://docs.python.org/3/library/collections.html#collections.somenamedtuple._make
if hasattr(type(data_to_wrap), "_make"):
return type(data_to_wrap)._make(return_tuple)
return return_tuple
return _wrap_data_with_container(method, data_to_wrap, X, self)
return wrapped
def _auto_wrap_is_configured(estimator):
"""Return True if estimator is configured for auto-wrapping the transform method.
`_SetOutputMixin` sets `_sklearn_auto_wrap_output_keys` to `set()` if auto wrapping
is manually disabled.
"""
auto_wrap_output_keys = getattr(estimator, "_sklearn_auto_wrap_output_keys", set())
return (
hasattr(estimator, "get_feature_names_out")
and "transform" in auto_wrap_output_keys
)
class _SetOutputMixin:
"""Mixin that dynamically wraps methods to return container based on config.
Currently `_SetOutputMixin` wraps `transform` and `fit_transform` and configures
it based on `set_output` of the global configuration.
`set_output` is only defined if `get_feature_names_out` is defined and
`auto_wrap_output_keys` is the default value.
"""
def __init_subclass__(cls, auto_wrap_output_keys=("transform",), **kwargs):
super().__init_subclass__(**kwargs)
# Dynamically wraps `transform` and `fit_transform` and configures their
# output based on `set_output`.
if not (
isinstance(auto_wrap_output_keys, tuple) or auto_wrap_output_keys is None
):
raise ValueError("auto_wrap_output_keys must be None or a tuple of keys.")
if auto_wrap_output_keys is None:
cls._sklearn_auto_wrap_output_keys = set()
return
# Mapping from method to key in configurations
method_to_key = {
"transform": "transform",
"fit_transform": "transform",
}
cls._sklearn_auto_wrap_output_keys = set()
for method, key in method_to_key.items():
if not hasattr(cls, method) or key not in auto_wrap_output_keys:
continue
cls._sklearn_auto_wrap_output_keys.add(key)
# Only wrap methods defined by cls itself
if method not in cls.__dict__:
continue
wrapped_method = _wrap_method_output(getattr(cls, method), key)
setattr(cls, method, wrapped_method)
@available_if(_auto_wrap_is_configured)
def set_output(self, *, transform=None):
"""Set output container.
See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
for an example on how to use the API.
Parameters
----------
transform : {"default", "pandas", "polars"}, default=None
Configure output of `transform` and `fit_transform`.
- `"default"`: Default output format of a transformer
- `"pandas"`: DataFrame output
- `"polars"`: Polars output
- `None`: Transform configuration is unchanged
.. versionadded:: 1.4
`"polars"` option was added.
Returns
-------
self : estimator instance
Estimator instance.
"""
if transform is None:
return self
if not hasattr(self, "_sklearn_output_config"):
self._sklearn_output_config = {}
self._sklearn_output_config["transform"] = transform
return self
def _safe_set_output(estimator, *, transform=None):
"""Safely call estimator.set_output and error if it not available.
This is used by meta-estimators to set the output for child estimators.
Parameters
----------
estimator : estimator instance
Estimator instance.
transform : {"default", "pandas", "polars"}, default=None
Configure output of the following estimator's methods:
- `"transform"`
- `"fit_transform"`
If `None`, this operation is a no-op.
Returns
-------
estimator : estimator instance
Estimator instance.
"""
set_output_for_transform = (
hasattr(estimator, "transform")
or hasattr(estimator, "fit_transform")
and transform is not None
)
if not set_output_for_transform:
# If estimator can not transform, then `set_output` does not need to be
# called.
return
if not hasattr(estimator, "set_output"):
raise ValueError(
f"Unable to configure output for {estimator} because `set_output` "
"is not available."
)
return estimator.set_output(transform=transform)
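# Illustrative usage of the wrapping machinery above (scikit-learn transformers
# inherit from _SetOutputMixin through TransformerMixin):
#
#     >>> import pandas as pd
#     >>> from sklearn.preprocessing import StandardScaler
#     >>> X = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [0.0, 1.0, 0.0]})
#     >>> scaler = StandardScaler().set_output(transform="pandas")
#     >>> scaler.fit_transform(X).columns.tolist()
#     ['a', 'b']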
@@ -0,0 +1,114 @@
"""
Utility methods to print system info for debugging
adapted from :func:`pandas.show_versions`
"""
# License: BSD 3 clause
import platform
import sys
from threadpoolctl import threadpool_info
from .. import __version__
from ._openmp_helpers import _openmp_parallelism_enabled
def _get_sys_info():
"""System information
Returns
-------
sys_info : dict
system and Python version information
"""
python = sys.version.replace("\n", " ")
blob = [
("python", python),
("executable", sys.executable),
("machine", platform.platform()),
]
return dict(blob)
def _get_deps_info():
"""Overview of the installed version of main dependencies
This function does not import the modules to collect the version numbers
but instead relies on standard Python package metadata.
Returns
-------
deps_info: dict
version information on relevant Python libraries
"""
deps = [
"pip",
"setuptools",
"numpy",
"scipy",
"Cython",
"pandas",
"matplotlib",
"joblib",
"threadpoolctl",
]
deps_info = {
"sklearn": __version__,
}
from importlib.metadata import PackageNotFoundError, version
for modname in deps:
try:
deps_info[modname] = version(modname)
except PackageNotFoundError:
deps_info[modname] = None
return deps_info
def show_versions():
"""Print useful debugging information"
.. versionadded:: 0.20
Examples
--------
>>> from sklearn import show_versions
>>> show_versions() # doctest: +SKIP
"""
sys_info = _get_sys_info()
deps_info = _get_deps_info()
print("\nSystem:")
for k, stat in sys_info.items():
print("{k:>10}: {stat}".format(k=k, stat=stat))
print("\nPython dependencies:")
for k, stat in deps_info.items():
print("{k:>13}: {stat}".format(k=k, stat=stat))
print(
"\n{k}: {stat}".format(
k="Built with OpenMP", stat=_openmp_parallelism_enabled()
)
)
# show threadpoolctl results
threadpool_results = threadpool_info()
if threadpool_results:
print()
print("threadpoolctl info:")
for i, result in enumerate(threadpool_results):
for key, val in result.items():
print(f"{key:>15}: {val}")
if i != len(threadpool_results) - 1:
print()
@@ -0,0 +1,9 @@
from ._typedefs cimport intp_t
from cython cimport floating
cdef int simultaneous_sort(
floating *dist,
intp_t *idx,
intp_t size,
) noexcept nogil
@@ -0,0 +1,93 @@
from cython cimport floating
cdef inline void dual_swap(
floating* darr,
intp_t *iarr,
intp_t a,
intp_t b,
) noexcept nogil:
"""Swap the values at index a and b of both darr and iarr"""
cdef floating dtmp = darr[a]
darr[a] = darr[b]
darr[b] = dtmp
cdef intp_t itmp = iarr[a]
iarr[a] = iarr[b]
iarr[b] = itmp
cdef int simultaneous_sort(
floating* values,
intp_t* indices,
intp_t size,
) noexcept nogil:
"""
Perform a recursive quicksort on the values array so as to sort them in ascending order.
This simultaneously performs the swaps on both the values and the indices arrays.
The numpy equivalent is:
def simultaneous_sort(dist, idx):
i = np.argsort(dist)
return dist[i], idx[i]
Notes
-----
Arrays are manipulated via a pointer to their first element and their size
so as to ease the processing of dynamically allocated buffers.
"""
# TODO: In order to support discrete distance metrics, we need to have a
# simultaneous sort which breaks ties on indices when distances are identical.
# The best might be using a std::stable_sort and a Comparator which might need
# an Array of Structures (AoS) instead of the Structure of Arrays (SoA)
# currently used.
cdef:
intp_t pivot_idx, i, store_idx
floating pivot_val
# in the small-array case, do things efficiently
if size <= 1:
pass
elif size == 2:
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
elif size == 3:
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
if values[1] > values[2]:
dual_swap(values, indices, 1, 2)
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
else:
# Determine the pivot using the median-of-three rule.
# The smallest of the three is moved to the beginning of the array,
# the middle (the pivot value) is moved to the end, and the largest
# is moved to the pivot index.
pivot_idx = size // 2
if values[0] > values[size - 1]:
dual_swap(values, indices, 0, size - 1)
if values[size - 1] > values[pivot_idx]:
dual_swap(values, indices, size - 1, pivot_idx)
if values[0] > values[size - 1]:
dual_swap(values, indices, 0, size - 1)
pivot_val = values[size - 1]
# Partition indices about pivot. At the end of this operation,
# pivot_idx will contain the pivot value, everything to the left
# will be smaller, and everything to the right will be larger.
store_idx = 0
for i in range(size - 1):
if values[i] < pivot_val:
dual_swap(values, indices, i, store_idx)
store_idx += 1
dual_swap(values, indices, store_idx, size - 1)
pivot_idx = store_idx
# Recursively sort each side of the pivot
if pivot_idx > 1:
simultaneous_sort(values, indices, pivot_idx)
if pivot_idx + 2 < size:
simultaneous_sort(values + pivot_idx + 1,
indices + pivot_idx + 1,
size - pivot_idx - 1)
return 0
@@ -0,0 +1,68 @@
import numpy as np
_DEFAULT_TAGS = {
"array_api_support": False,
"non_deterministic": False,
"requires_positive_X": False,
"requires_positive_y": False,
"X_types": ["2darray"],
"poor_score": False,
"no_validation": False,
"multioutput": False,
"allow_nan": False,
"stateless": False,
"multilabel": False,
"_skip_test": False,
"_xfail_checks": False,
"multioutput_only": False,
"binary_only": False,
"requires_fit": True,
"preserves_dtype": [np.float64],
"requires_y": False,
"pairwise": False,
}
def _safe_tags(estimator, key=None):
"""Safely get estimator tags.
:class:`~sklearn.BaseEstimator` provides the estimator tags machinery.
However, if an estimator does not inherit from this base class, we should
fall-back to the default tags.
For scikit-learn built-in estimators, we should still rely on
`self._get_tags()`. `_safe_tags(est)` should be used when we are not sure
where `est` comes from: typically `_safe_tags(self.base_estimator)` where
`self` is a meta-estimator, or in the common checks.
Parameters
----------
estimator : estimator object
The estimator from which to get the tag.
key : str, default=None
Tag name to get. By default (`None`), all tags are returned.
Returns
-------
tags : dict or tag value
The estimator tags. A single value is returned if `key` is not None.
"""
if hasattr(estimator, "_get_tags"):
tags_provider = "_get_tags()"
tags = estimator._get_tags()
elif hasattr(estimator, "_more_tags"):
tags_provider = "_more_tags()"
tags = {**_DEFAULT_TAGS, **estimator._more_tags()}
else:
tags_provider = "_DEFAULT_TAGS"
tags = _DEFAULT_TAGS
if key is not None:
if key not in tags:
raise ValueError(
f"The key {key} is not defined in {tags_provider} for the "
f"class {estimator.__class__.__name__}."
)
return tags[key]
return tags
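# Illustrative usage (internal helper, shown for context only):
#
#     >>> from sklearn.linear_model import LogisticRegression
#     >>> _safe_tags(LogisticRegression(), key="requires_y")
#     True
#     >>> _safe_tags(LogisticRegression())["allow_nan"]
#     False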
File diff suppressed because it is too large
@@ -0,0 +1,41 @@
# Commonly used types
# These are redefinitions of the ones defined by numpy in
# https://github.com/numpy/numpy/blob/main/numpy/__init__.pxd.
# Eventually this will avoid having to always include the numpy headers, even when
# we only use them for the types.
#
# When used to declare variables that will receive values from numpy arrays, it
# should match the dtype of the array. For example, to declare a variable that will
# receive values from a numpy array of dtype np.float64, the type float64_t must be
# used.
#
# TODO: Stop defining custom types locally or globally like DTYPE_t and friends and
# use these consistently throughout the codebase.
# NOTE: Extend this list as needed when converting more cython extensions.
ctypedef unsigned char uint8_t
ctypedef unsigned int uint32_t
ctypedef unsigned long long uint64_t
# Note: In NumPy 2, indexing always happens with npy_intp which is an alias for
# the Py_ssize_t type, see PEP 353.
#
# Note that on most platforms Py_ssize_t is equivalent to C99's intptr_t,
# but they can differ on architectures with segmented memory (none
# supported by scikit-learn at the time of writing).
#
# intp_t/np.intp should be used to index arrays in a platform dependent way.
# Storing arrays with platform dependent dtypes as attribute on picklable
# objects is not recommended as it requires special care when loading and
# using such datastructures on a host with different bitness. Instead one
# should rather use fixed width integer types such as int32 or uint32 when we know
# that the number of elements to index is not larger than 2 or 4 billion.
ctypedef Py_ssize_t intp_t
ctypedef float float32_t
ctypedef double float64_t
# Sparse matrix indices and index pointer arrays must use int32_t over
# intp_t because intp_t is platform dependent.
# When large sparse matrices are supported, indexing must use int64_t.
# See https://github.com/scikit-learn/scikit-learn/issues/23653 which tracks the
# ongoing work to support large sparse matrices.
ctypedef signed char int8_t
ctypedef signed int int32_t
ctypedef signed long long int64_t
@@ -0,0 +1,23 @@
# _typedefs is a declaration only module
#
# The functions implemented here are for testing purpose only.
import numpy as np
ctypedef fused testing_type_t:
float32_t
float64_t
int8_t
int32_t
int64_t
intp_t
uint8_t
uint32_t
uint64_t
def testing_make_array_from_typed_val(testing_type_t val):
cdef testing_type_t[:] val_view = <testing_type_t[:1]>&val
return np.asarray(val_view)
@@ -0,0 +1,54 @@
import timeit
from contextlib import contextmanager
def _message_with_time(source, message, time):
"""Create one line message for logging purposes.
Parameters
----------
source : str
String indicating the source or the reference of the message.
message : str
Short message.
time : int
Time in seconds.
"""
start_message = "[%s] " % source
# adapted from joblib.logger.short_format_time without the Windows -.1s
# adjustment
if time > 60:
time_str = "%4.1fmin" % (time / 60)
else:
time_str = " %5.1fs" % time
end_message = " %s, total=%s" % (message, time_str)
dots_len = 70 - len(start_message) - len(end_message)
return "%s%s%s" % (start_message, dots_len * ".", end_message)
@contextmanager
def _print_elapsed_time(source, message=None):
"""Log elapsed time to stdout when the context is exited.
Parameters
----------
source : str
String indicating the source or the reference of the message.
message : str, default=None
Short message. If None, nothing will be printed.
Returns
-------
context_manager
Prints elapsed time upon exit if verbose.
"""
if message is None:
yield
else:
start = timeit.default_timer()
yield
print(_message_with_time(source, message, timeit.default_timer() - start))
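# Illustrative usage (the one-line summary printed on exit mirrors the verbose
# output of meta-estimators such as Pipeline):
#
#     import time
#     with _print_elapsed_time("Pipeline", "fitting step 1 of 2"):
#         time.sleep(0.1)
#     # prints something like:
#     # [Pipeline] ................... fitting step 1 of 2, total=   0.1s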
@@ -0,0 +1,12 @@
cimport numpy as cnp
from libcpp.vector cimport vector
from ..utils._typedefs cimport intp_t, float64_t, int32_t, int64_t
ctypedef fused vector_typed:
vector[float64_t]
vector[intp_t]
vector[int32_t]
vector[int64_t]
cdef cnp.ndarray vector_to_nd_array(vector_typed * vect_ptr)
@@ -0,0 +1,118 @@
from cython.operator cimport dereference as deref
from cpython.ref cimport Py_INCREF
cimport numpy as cnp
cnp.import_array()
cdef StdVectorSentinel _create_sentinel(vector_typed * vect_ptr):
if vector_typed is vector[float64_t]:
return StdVectorSentinelFloat64.create_for(vect_ptr)
elif vector_typed is vector[int32_t]:
return StdVectorSentinelInt32.create_for(vect_ptr)
elif vector_typed is vector[int64_t]:
return StdVectorSentinelInt64.create_for(vect_ptr)
else: # intp_t
return StdVectorSentinelIntP.create_for(vect_ptr)
cdef class StdVectorSentinel:
"""Wraps a reference to a vector which will be deallocated with this object.
When created, the StdVectorSentinel swaps the reference of its internal
vectors with the provided one (vect_ptr), thus making the StdVectorSentinel
manage the provided one's lifetime.
"""
cdef void* get_data(self):
"""Return pointer to data."""
cdef int get_typenum(self):
"""Get typenum for PyArray_SimpleNewFromData."""
cdef class StdVectorSentinelFloat64(StdVectorSentinel):
cdef vector[float64_t] vec
@staticmethod
cdef StdVectorSentinel create_for(vector[float64_t] * vect_ptr):
# This initializes the object directly without calling __init__
# See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa
cdef StdVectorSentinelFloat64 sentinel = StdVectorSentinelFloat64.__new__(StdVectorSentinelFloat64)
sentinel.vec.swap(deref(vect_ptr))
return sentinel
cdef void* get_data(self):
return self.vec.data()
cdef int get_typenum(self):
return cnp.NPY_FLOAT64
cdef class StdVectorSentinelIntP(StdVectorSentinel):
cdef vector[intp_t] vec
@staticmethod
cdef StdVectorSentinel create_for(vector[intp_t] * vect_ptr):
# This initializes the object directly without calling __init__
# See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa
cdef StdVectorSentinelIntP sentinel = StdVectorSentinelIntP.__new__(StdVectorSentinelIntP)
sentinel.vec.swap(deref(vect_ptr))
return sentinel
cdef void* get_data(self):
return self.vec.data()
cdef int get_typenum(self):
return cnp.NPY_INTP
cdef class StdVectorSentinelInt32(StdVectorSentinel):
cdef vector[int32_t] vec
@staticmethod
cdef StdVectorSentinel create_for(vector[int32_t] * vect_ptr):
# This initializes the object directly without calling __init__
# See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa
cdef StdVectorSentinelInt32 sentinel = StdVectorSentinelInt32.__new__(StdVectorSentinelInt32)
sentinel.vec.swap(deref(vect_ptr))
return sentinel
cdef void* get_data(self):
return self.vec.data()
cdef int get_typenum(self):
return cnp.NPY_INT32
cdef class StdVectorSentinelInt64(StdVectorSentinel):
cdef vector[int64_t] vec
@staticmethod
cdef StdVectorSentinel create_for(vector[int64_t] * vect_ptr):
# This initializes the object directly without calling __init__
# See: https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#instantiation-from-existing-c-c-pointers # noqa
cdef StdVectorSentinelInt64 sentinel = StdVectorSentinelInt64.__new__(StdVectorSentinelInt64)
sentinel.vec.swap(deref(vect_ptr))
return sentinel
cdef void* get_data(self):
return self.vec.data()
cdef int get_typenum(self):
return cnp.NPY_INT64
cdef cnp.ndarray vector_to_nd_array(vector_typed * vect_ptr):
cdef:
cnp.npy_intp size = deref(vect_ptr).size()
StdVectorSentinel sentinel = _create_sentinel(vect_ptr)
cnp.ndarray arr = cnp.PyArray_SimpleNewFromData(
1, &size, sentinel.get_typenum(), sentinel.get_data())
# Makes the numpy array responsible of the life-cycle of its buffer.
# A reference to the StdVectorSentinel will be stolen by the call to
# `PyArray_SetBaseObject` below, so we increase its reference counter.
# See: https://docs.python.org/3/c-api/intro.html#reference-count-details
Py_INCREF(sentinel)
cnp.PyArray_SetBaseObject(arr, sentinel)
return arr
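# Illustrative usage from other Cython code (comment-only sketch; the calling
# module is hypothetical):
#
#     cdef vector[intp_t] indices
#     indices.push_back(3)
#     indices.push_back(7)
#     arr = vector_to_nd_array(&indices)   # 1d intp ndarray, zero copy
#
# After the call, the sentinel owns the vector's buffer (it was swapped into
# the sentinel), and the buffer is only freed when `arr` is garbage collected.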
@@ -0,0 +1,45 @@
{{py:
"""
Efficient (dense) parameter vector implementation for linear models.
Template file to easily generate fused-type-consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _weight_vector.pxd
Each class is duplicated for all dtypes (float and double). The keywords
between double braces are substituted by Tempita at build time.
"""
# name_suffix, c_type
dtypes = [('64', 'double'),
('32', 'float')]
}}
{{for name_suffix, c_type in dtypes}}
cdef class WeightVector{{name_suffix}}(object):
cdef readonly {{c_type}}[::1] w
cdef readonly {{c_type}}[::1] aw
cdef {{c_type}} *w_data_ptr
cdef {{c_type}} *aw_data_ptr
cdef double wscale
cdef double average_a
cdef double average_b
cdef int n_features
cdef double sq_norm
cdef void add(self, {{c_type}} *x_data_ptr, int *x_ind_ptr,
int xnnz, {{c_type}} c) noexcept nogil
cdef void add_average(self, {{c_type}} *x_data_ptr, int *x_ind_ptr,
int xnnz, {{c_type}} c, {{c_type}} num_iter) noexcept nogil
cdef {{c_type}} dot(self, {{c_type}} *x_data_ptr, int *x_ind_ptr,
int xnnz) noexcept nogil
cdef void scale(self, {{c_type}} c) noexcept nogil
cdef void reset_wscale(self) noexcept nogil
cdef {{c_type}} norm(self) noexcept nogil
{{endfor}}
@@ -0,0 +1,210 @@
{{py:
"""
Efficient (dense) parameter vector implementation for linear models.
Template file to easily generate fused-type-consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _weight_vector.pyx
Each class is duplicated for all dtypes (float and double). The keywords
between double braces are substituted by Tempita at build time.
"""
# name_suffix, c_type, reset_wscale_threshold
dtypes = [('64', 'double', 1e-9),
('32', 'float', 1e-6)]
}}
# cython: binding=False
#
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Lars Buitinck
# Danny Sullivan <dsullivan7@hotmail.com>
#
# License: BSD 3 clause
cimport cython
from libc.limits cimport INT_MAX
from libc.math cimport sqrt
from ._cython_blas cimport _dot, _scal, _axpy
{{for name_suffix, c_type, reset_wscale_threshold in dtypes}}
cdef class WeightVector{{name_suffix}}(object):
"""Dense vector represented by a scalar and a numpy array.
The class provides methods to ``add`` a sparse vector
and scale the vector.
Representing a vector explicitly as a scalar times a
vector allows for efficient scaling operations.
Attributes
----------
w : ndarray, dtype={{c_type}}, order='C'
The numpy array which backs the weight vector.
aw : ndarray, dtype={{c_type}}, order='C'
The numpy array which backs the average_weight vector.
w_data_ptr : {{c_type}}*
A pointer to the data of the numpy array.
wscale : {{c_type}}
The scale of the vector.
n_features : int
The number of features (= dimensionality of ``w``).
sq_norm : {{c_type}}
The squared norm of ``w``.
"""
def __cinit__(self,
{{c_type}}[::1] w,
{{c_type}}[::1] aw):
if w.shape[0] > INT_MAX:
raise ValueError("More than %d features not supported; got %d."
% (INT_MAX, w.shape[0]))
self.w = w
self.w_data_ptr = &w[0]
self.wscale = 1.0
self.n_features = w.shape[0]
self.sq_norm = _dot(self.n_features, self.w_data_ptr, 1, self.w_data_ptr, 1)
self.aw = aw
if self.aw is not None:
self.aw_data_ptr = &aw[0]
self.average_a = 0.0
self.average_b = 1.0
cdef void add(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz,
{{c_type}} c) noexcept nogil:
"""Scales sample x by constant c and adds it to the weight vector.
This operation updates ``sq_norm``.
Parameters
----------
x_data_ptr : {{c_type}}*
The array which holds the feature values of ``x``.
x_ind_ptr : np.intc*
The array which holds the feature indices of ``x``.
xnnz : int
The number of non-zero features of ``x``.
c : {{c_type}}
The scaling constant for the example.
"""
cdef int j
cdef int idx
cdef double val
cdef double innerprod = 0.0
cdef double xsqnorm = 0.0
# the next two lines save a factor of 2!
cdef {{c_type}} wscale = self.wscale
cdef {{c_type}}* w_data_ptr = self.w_data_ptr
for j in range(xnnz):
idx = x_ind_ptr[j]
val = x_data_ptr[j]
innerprod += (w_data_ptr[idx] * val)
xsqnorm += (val * val)
w_data_ptr[idx] += val * (c / wscale)
self.sq_norm += (xsqnorm * c * c) + (2.0 * innerprod * wscale * c)
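# Note on the update above (added for clarity): the represented vector is
# v = wscale * w, so adding c * x to v is done as w[idx] += x[idx] * (c / wscale).
# The squared norm then follows
#     ||v + c x||^2 = ||v||^2 + c^2 ||x||^2 + 2 c <v, x>
#                   = sq_norm + c^2 * xsqnorm + 2 * c * wscale * innerprod
# which is exactly the expression used for self.sq_norm.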
# Update the average weights according to the sparse trick defined
# here: https://research.microsoft.com/pubs/192769/tricks-2012.pdf
# by Leon Bottou
cdef void add_average(self, {{c_type}} *x_data_ptr, int *x_ind_ptr, int xnnz,
{{c_type}} c, {{c_type}} num_iter) noexcept nogil:
"""Updates the average weight vector.
Parameters
----------
x_data_ptr : {{c_type}}*
The array which holds the feature values of ``x``.
x_ind_ptr : np.intc*
The array which holds the feature indices of ``x``.
xnnz : int
The number of non-zero features of ``x``.
c : {{c_type}}
The scaling constant for the example.
num_iter : {{c_type}}
The total number of iterations.
"""
cdef int j
cdef int idx
cdef double val
cdef double mu = 1.0 / num_iter
cdef double average_a = self.average_a
cdef double wscale = self.wscale
cdef {{c_type}}* aw_data_ptr = self.aw_data_ptr
for j in range(xnnz):
idx = x_ind_ptr[j]
val = x_data_ptr[j]
aw_data_ptr[idx] += (self.average_a * val * (-c / wscale))
# Once the sample has been processed
# update the average_a and average_b
if num_iter > 1:
self.average_b /= (1.0 - mu)
self.average_a += mu * self.average_b * wscale
cdef {{c_type}} dot(self, {{c_type}} *x_data_ptr, int *x_ind_ptr,
int xnnz) noexcept nogil:
"""Computes the dot product of a sample x and the weight vector.
Parameters
----------
x_data_ptr : {{c_type}}*
The array which holds the feature values of ``x``.
x_ind_ptr : np.intc*
The array which holds the feature indices of ``x``.
xnnz : int
The number of non-zero features of ``x`` (length of x_ind_ptr).
Returns
-------
innerprod : {{c_type}}
The inner product of ``x`` and ``w``.
"""
cdef int j
cdef int idx
cdef double innerprod = 0.0
cdef {{c_type}}* w_data_ptr = self.w_data_ptr
for j in range(xnnz):
idx = x_ind_ptr[j]
innerprod += w_data_ptr[idx] * x_data_ptr[j]
innerprod *= self.wscale
return innerprod
cdef void scale(self, {{c_type}} c) noexcept nogil:
"""Scales the weight vector by a constant ``c``.
It updates ``wscale`` and ``sq_norm``. If ``wscale`` gets too
small we call ``reset_wscale``."""
self.wscale *= c
self.sq_norm *= (c * c)
if self.wscale < {{reset_wscale_threshold}}:
self.reset_wscale()
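# Note on the lazy scaling trick (added for clarity): since the represented
# vector is v = wscale * w, scale() only touches two scalars (wscale and
# sq_norm) instead of all n_features entries; e.g. after two calls to
# scale(0.5), wscale == 0.25 and dot() still returns 0.25 * <w, x>. Only when
# wscale drifts below the threshold does reset_wscale() fold the factor back
# into w at O(n_features) cost to preserve numerical precision.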
cdef void reset_wscale(self) noexcept nogil:
"""Scales each coef of ``w`` by ``wscale`` and resets it to 1. """
if self.aw_data_ptr != NULL:
_axpy(self.n_features, self.average_a,
self.w_data_ptr, 1, self.aw_data_ptr, 1)
_scal(self.n_features, 1.0 / self.average_b, self.aw_data_ptr, 1)
self.average_a = 0.0
self.average_b = 1.0
_scal(self.n_features, self.wscale, self.w_data_ptr, 1)
self.wscale = 1.0
cdef {{c_type}} norm(self) noexcept nogil:
"""The L2 norm of the weight vector. """
return sqrt(self.sq_norm)
{{endfor}}
@@ -0,0 +1,137 @@
"""
The :mod:`sklearn.utils.arrayfuncs` module includes a small collection of auxiliary
functions that operate on arrays.
"""
from cython cimport floating
from cython.parallel cimport prange
from libc.math cimport fabs
from libc.float cimport DBL_MAX, FLT_MAX
from ._cython_blas cimport _copy, _rotg, _rot
from ._typedefs cimport float64_t
ctypedef fused real_numeric:
short
int
long
long long
float
double
def min_pos(const floating[:] X):
"""Find the minimum value of an array over positive values.
Returns the maximum representable value of the input dtype if none of the
values are positive.
Parameters
----------
X : ndarray of shape (n,)
Input array.
Returns
-------
min_val : float
The smallest positive value in the array, or the maximum representable value
of the input dtype if no positive values are found.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils.arrayfuncs import min_pos
>>> X = np.array([0, -1, 2, 3, -4, 5])
>>> min_pos(X)
2.0
"""
cdef Py_ssize_t i
cdef floating min_val = FLT_MAX if floating is float else DBL_MAX
for i in range(X.size):
if 0. < X[i] < min_val:
min_val = X[i]
return min_val
def _all_with_any_reduction_axis_1(real_numeric[:, :] array, real_numeric value):
"""Check whether any row contains all values equal to `value`.
It is equivalent to `np.any(np.all(X == value, axis=1))`, but it avoids
materializing the temporary boolean matrices in memory.
Parameters
----------
array: array-like
The array to be checked.
value: short, int, long, float, or double
The value to use for the comparison.
Returns
-------
any_all_equal: bool
Whether or not any rows contains all values equal to `value`.
"""
cdef Py_ssize_t i, j
for i in range(array.shape[0]):
for j in range(array.shape[1]):
if array[i, j] != value:
break
else: # no break
return True
return False
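# Illustrative sketch (comment only): with array = np.array([[1, 1], [1, 2]])
# and value = 1, the inner loop over the first row never hits `break`, so the
# for/else clause returns True, matching np.any(np.all(array == value, axis=1)).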
# General Cholesky Delete.
# Remove an element from the cholesky factorization
# m = columns
# n = rows
#
# TODO: put transpose as an option
def cholesky_delete(floating[:, :] L, int go_out):
cdef:
int n = L.shape[0]
int m = L.strides[0]
floating c, s
floating *L1
int i
if floating is float:
m /= sizeof(float)
else:
m /= sizeof(double)
# delete row go_out
L1 = &L[0, 0] + (go_out * m)
for i in range(go_out, n-1):
_copy(i + 2, L1 + m, 1, L1, 1)
L1 += m
L1 = &L[0, 0] + (go_out * m)
for i in range(go_out, n-1):
_rotg(L1 + i, L1 + i + 1, &c, &s)
if L1[i] < 0:
# Diagonals cannot be negative
L1[i] = fabs(L1[i])
c = -c
s = -s
L1[i + 1] = 0. # just for cleanup
L1 += m
_rot(n - i - 2, L1 + i, m, L1 + i + 1, m, c, s)
def sum_parallel(const floating [:] array, int n_threads):
"""Parallel sum, always using float64 internally."""
cdef:
float64_t out = 0.
int i = 0
for i in prange(
array.shape[0], schedule='static', nogil=True, num_threads=n_threads
):
out += array[i]
return out
@@ -0,0 +1,224 @@
"""
The :mod:`sklearn.utils.class_weight` module includes utilities for handling
weights based on class labels.
"""
# Authors: Andreas Mueller
# Manoj Kumar
# License: BSD 3 clause
import numpy as np
from scipy import sparse
from ._param_validation import StrOptions, validate_params
@validate_params(
{
"class_weight": [dict, StrOptions({"balanced"}), None],
"classes": [np.ndarray],
"y": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def compute_class_weight(class_weight, *, classes, y):
"""Estimate class weights for unbalanced datasets.
Parameters
----------
class_weight : dict, "balanced" or None
If "balanced", class weights will be given by
`n_samples / (n_classes * np.bincount(y))`.
If a dictionary is given, keys are classes and values are corresponding class
weights.
If `None` is given, the class weights will be uniform.
classes : ndarray
Array of the classes occurring in the data, as given by
`np.unique(y_org)` with `y_org` the original class labels.
y : array-like of shape (n_samples,)
Array of original class labels per sample.
Returns
-------
class_weight_vect : ndarray of shape (n_classes,)
Array with `class_weight_vect[i]` the weight for i-th class.
References
----------
The "balanced" heuristic is inspired by
Logistic Regression in Rare Events Data, King, Zeng, 2001.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils.class_weight import compute_class_weight
>>> y = [1, 1, 1, 1, 0, 0]
>>> compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
array([1.5 , 0.75])
"""
# Import error caused by circular imports.
from ..preprocessing import LabelEncoder
if set(y) - set(classes):
raise ValueError("classes should include all valid labels that can be in y")
if class_weight is None or len(class_weight) == 0:
# uniform class weights
weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
elif class_weight == "balanced":
# Find the weight of each class as present in y.
le = LabelEncoder()
y_ind = le.fit_transform(y)
if not all(np.isin(classes, le.classes_)):
raise ValueError("classes should have valid labels that are in y")
recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))
weight = recip_freq[le.transform(classes)]
else:
# user-defined dictionary
weight = np.ones(classes.shape[0], dtype=np.float64, order="C")
unweighted_classes = []
for i, c in enumerate(classes):
if c in class_weight:
weight[i] = class_weight[c]
else:
unweighted_classes.append(c)
n_weighted_classes = len(classes) - len(unweighted_classes)
if unweighted_classes and n_weighted_classes != len(class_weight):
unweighted_classes_user_friendly_str = np.array(unweighted_classes).tolist()
raise ValueError(
f"The classes, {unweighted_classes_user_friendly_str}, are not in"
" class_weight"
)
return weight
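# Worked example for the "balanced" heuristic (consistent with the doctest
# above): y = [1, 1, 1, 1, 0, 0] gives n_samples=6, n_classes=2 and class
# counts [2, 4] for classes [0, 1], so the weights are
#     6 / (2 * 2) = 1.5   for class 0
#     6 / (2 * 4) = 0.75  for class 1
# i.e. array([1.5, 0.75]).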
@validate_params(
{
"class_weight": [dict, list, StrOptions({"balanced"}), None],
"y": ["array-like", "sparse matrix"],
"indices": ["array-like", None],
},
prefer_skip_nested_validation=True,
)
def compute_sample_weight(class_weight, y, *, indices=None):
"""Estimate sample weights by class for unbalanced datasets.
Parameters
----------
class_weight : dict, list of dicts, "balanced", or None
Weights associated with classes in the form `{class_label: weight}`.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.
Note that for multioutput (including multilabel) weights should be
defined for each class of every column in its own dict. For example,
for four-class multilabel classification weights should be
`[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]` instead of
`[{1:1}, {2:5}, {3:1}, {4:1}]`.
The `"balanced"` mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data:
`n_samples / (n_classes * np.bincount(y))`.
For multi-output, the weights of each column of y will be multiplied.
y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
Array of original class labels per sample.
indices : array-like of shape (n_subsample,), default=None
Array of indices to be used in a subsample. Can be of length less than
`n_samples` in the case of a subsample, or equal to `n_samples` in the
case of a bootstrap subsample with repeated indices. If `None`, the
sample weight will be calculated over the full sample. Only `"balanced"`
is supported for `class_weight` if this is provided.
Returns
-------
sample_weight_vect : ndarray of shape (n_samples,)
Array with sample weights as applied to the original `y`.
Examples
--------
>>> from sklearn.utils.class_weight import compute_sample_weight
>>> y = [1, 1, 1, 1, 0, 0]
>>> compute_sample_weight(class_weight="balanced", y=y)
array([0.75, 0.75, 0.75, 0.75, 1.5 , 1.5 ])
"""
# Ensure y is 2D. Sparse matrices are already 2D.
if not sparse.issparse(y):
y = np.atleast_1d(y)
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
n_outputs = y.shape[1]
if indices is not None and class_weight != "balanced":
raise ValueError(
"The only valid class_weight for subsampling is 'balanced'. "
f"Given {class_weight}."
)
elif n_outputs > 1:
if class_weight is None or isinstance(class_weight, dict):
raise ValueError(
"For multi-output, class_weight should be a list of dicts, or the "
"string 'balanced'."
)
elif isinstance(class_weight, list) and len(class_weight) != n_outputs:
raise ValueError(
"For multi-output, number of elements in class_weight should match "
f"number of outputs. Got {len(class_weight)} element(s) while having "
f"{n_outputs} outputs."
)
expanded_class_weight = []
for k in range(n_outputs):
if sparse.issparse(y):
# Ok to densify a single column at a time
y_full = y[:, [k]].toarray().flatten()
else:
y_full = y[:, k]
classes_full = np.unique(y_full)
classes_missing = None
if class_weight == "balanced" or n_outputs == 1:
class_weight_k = class_weight
else:
class_weight_k = class_weight[k]
if indices is not None:
# Get class weights for the subsample, covering all classes in
# case some labels that were present in the original data are
# missing from the sample.
y_subsample = y_full[indices]
classes_subsample = np.unique(y_subsample)
weight_k = np.take(
compute_class_weight(
class_weight_k, classes=classes_subsample, y=y_subsample
),
np.searchsorted(classes_subsample, classes_full),
mode="clip",
)
classes_missing = set(classes_full) - set(classes_subsample)
else:
weight_k = compute_class_weight(
class_weight_k, classes=classes_full, y=y_full
)
weight_k = weight_k[np.searchsorted(classes_full, y_full)]
if classes_missing:
# Make missing classes' weight zero
weight_k[np.isin(y_full, list(classes_missing))] = 0.0
expanded_class_weight.append(weight_k)
expanded_class_weight = np.prod(expanded_class_weight, axis=0, dtype=np.float64)
return expanded_class_weight
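# Worked example (consistent with the doctest above): with
# class_weight="balanced" and y = [1, 1, 1, 1, 0, 0], compute_class_weight
# returns [1.5, 0.75] for classes [0, 1]; mapping each sample to the weight of
# its class gives [0.75, 0.75, 0.75, 0.75, 1.5, 1.5]. For multi-output y, the
# per-output weights computed this way are multiplied element-wise (np.prod
# over outputs).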
@@ -0,0 +1,135 @@
import functools
import warnings
__all__ = ["deprecated"]
class deprecated:
"""Decorator to mark a function or class as deprecated.
Issue a warning when the function is called/the class is instantiated and
add a warning to the docstring.
The optional extra argument will be appended to the deprecation message
and the docstring. Note: to use this with the default value for extra, put
in an empty pair of parentheses:
Examples
--------
>>> from sklearn.utils import deprecated
>>> deprecated()
<sklearn.utils.deprecation.deprecated object at ...>
>>> @deprecated()
... def some_function(): pass
Parameters
----------
extra : str, default=''
To be added to the deprecation messages.
"""
# Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary,
# but with many changes.
def __init__(self, extra=""):
self.extra = extra
def __call__(self, obj):
"""Call method
Parameters
----------
obj : object
"""
if isinstance(obj, type):
return self._decorate_class(obj)
elif isinstance(obj, property):
# Note that this is only triggered properly if the `deprecated`
# decorator is placed before the `property` decorator, like so:
#
# @deprecated(msg)
# @property
# def deprecated_attribute_(self):
# ...
return self._decorate_property(obj)
else:
return self._decorate_fun(obj)
def _decorate_class(self, cls):
msg = "Class %s is deprecated" % cls.__name__
if self.extra:
msg += "; %s" % self.extra
new = cls.__new__
def wrapped(cls, *args, **kwargs):
warnings.warn(msg, category=FutureWarning)
if new is object.__new__:
return object.__new__(cls)
return new(cls, *args, **kwargs)
cls.__new__ = wrapped
wrapped.__name__ = "__new__"
wrapped.deprecated_original = new
return cls
def _decorate_fun(self, fun):
"""Decorate function fun"""
msg = "Function %s is deprecated" % fun.__name__
if self.extra:
msg += "; %s" % self.extra
@functools.wraps(fun)
def wrapped(*args, **kwargs):
warnings.warn(msg, category=FutureWarning)
return fun(*args, **kwargs)
# Add a reference to the wrapped function so that we can introspect
# on function arguments in Python 2 (already works in Python 3)
wrapped.__wrapped__ = fun
return wrapped
def _decorate_property(self, prop):
msg = self.extra
@property
@functools.wraps(prop)
def wrapped(*args, **kwargs):
warnings.warn(msg, category=FutureWarning)
return prop.fget(*args, **kwargs)
return wrapped
def _is_deprecated(func):
"""Helper to check if func is wrapped by our deprecated decorator"""
closures = getattr(func, "__closure__", [])
if closures is None:
closures = []
is_deprecated = "deprecated" in "".join(
[c.cell_contents for c in closures if isinstance(c.cell_contents, str)]
)
return is_deprecated
# TODO: remove in 1.7
def _deprecate_Xt_in_inverse_transform(X, Xt):
"""Helper to deprecate the `Xt` argument in favor of `X` in inverse_transform."""
if X is not None and Xt is not None:
raise TypeError("Cannot use both X and Xt. Use X only.")
if X is None and Xt is None:
raise TypeError("Missing required positional argument: X.")
if Xt is not None:
warnings.warn(
"Xt was renamed X in version 1.5 and will be removed in 1.7.",
FutureWarning,
)
return Xt
return X
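# Illustrative usage sketch (hypothetical estimator method, not part of this
# module): a transformer that still accepts the deprecated keyword would
# typically start its method with
#
#     def inverse_transform(self, X=None, *, Xt=None):
#         X = _deprecate_Xt_in_inverse_transform(X, Xt)
#         ...
#
# so that `est.inverse_transform(Xt=data)` keeps working but emits a
# FutureWarning, while passing both X and Xt raises a TypeError.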
@@ -0,0 +1,265 @@
"""
The :mod:`sklearn.utils.discovery` module includes utilities to discover
objects (i.e. estimators, displays, functions) from the `sklearn` package.
"""
import inspect
import pkgutil
from importlib import import_module
from operator import itemgetter
from pathlib import Path
_MODULE_TO_IGNORE = {
"tests",
"externals",
"setup",
"conftest",
"experimental",
"estimator_checks",
}
def all_estimators(type_filter=None):
"""Get a list of all estimators from `sklearn`.
This function crawls the module and gets all classes that inherit
from BaseEstimator. Classes that are defined in test-modules are not
included.
Parameters
----------
type_filter : {"classifier", "regressor", "cluster", "transformer"} \
or list of such str, default=None
Which kind of estimators should be returned. If None, no filter is
applied and all estimators are returned. Possible values are
'classifier', 'regressor', 'cluster' and 'transformer' to get
estimators only of these specific types, or a list of these to
get the estimators that fit at least one of the types.
Returns
-------
estimators : list of tuples
List of (name, class), where ``name`` is the class name as string
and ``class`` is the actual type of the class.
Examples
--------
>>> from sklearn.utils.discovery import all_estimators
>>> estimators = all_estimators()
>>> type(estimators)
<class 'list'>
>>> type(estimators[0])
<class 'tuple'>
>>> estimators[:2]
[('ARDRegression', <class 'sklearn.linear_model._bayes.ARDRegression'>),
('AdaBoostClassifier',
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>)]
>>> classifiers = all_estimators(type_filter="classifier")
>>> classifiers[:2]
[('AdaBoostClassifier',
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>),
('BaggingClassifier', <class 'sklearn.ensemble._bagging.BaggingClassifier'>)]
>>> regressors = all_estimators(type_filter="regressor")
>>> regressors[:2]
[('ARDRegression', <class 'sklearn.linear_model._bayes.ARDRegression'>),
('AdaBoostRegressor',
<class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>)]
>>> both = all_estimators(type_filter=["classifier", "regressor"])
>>> both[:2]
[('ARDRegression', <class 'sklearn.linear_model._bayes.ARDRegression'>),
('AdaBoostClassifier',
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>)]
"""
# lazy import to avoid circular imports from sklearn.base
from ..base import (
BaseEstimator,
ClassifierMixin,
ClusterMixin,
RegressorMixin,
TransformerMixin,
)
from ._testing import ignore_warnings
from .fixes import _IS_PYPY
def is_abstract(c):
if not (hasattr(c, "__abstractmethods__")):
return False
if not len(c.__abstractmethods__):
return False
return True
all_classes = []
root = str(Path(__file__).parent.parent) # sklearn package
# Ignore deprecation warnings triggered at import time and from walking
# packages
with ignore_warnings(category=FutureWarning):
for _, module_name, _ in pkgutil.walk_packages(path=[root], prefix="sklearn."):
module_parts = module_name.split(".")
if (
any(part in _MODULE_TO_IGNORE for part in module_parts)
or "._" in module_name
):
continue
module = import_module(module_name)
classes = inspect.getmembers(module, inspect.isclass)
classes = [
(name, est_cls) for name, est_cls in classes if not name.startswith("_")
]
# TODO: Remove when FeatureHasher is implemented in PYPY
# Skips FeatureHasher for PYPY
if _IS_PYPY and "feature_extraction" in module_name:
classes = [
(name, est_cls)
for name, est_cls in classes
if name != "FeatureHasher"
]
all_classes.extend(classes)
all_classes = set(all_classes)
estimators = [
c
for c in all_classes
if (issubclass(c[1], BaseEstimator) and c[0] != "BaseEstimator")
]
# get rid of abstract base classes
estimators = [c for c in estimators if not is_abstract(c[1])]
if type_filter is not None:
if not isinstance(type_filter, list):
type_filter = [type_filter]
else:
type_filter = list(type_filter) # copy
filtered_estimators = []
filters = {
"classifier": ClassifierMixin,
"regressor": RegressorMixin,
"transformer": TransformerMixin,
"cluster": ClusterMixin,
}
for name, mixin in filters.items():
if name in type_filter:
type_filter.remove(name)
filtered_estimators.extend(
[est for est in estimators if issubclass(est[1], mixin)]
)
estimators = filtered_estimators
if type_filter:
raise ValueError(
"Parameter type_filter must be 'classifier', "
"'regressor', 'transformer', 'cluster' or "
"None, got"
f" {repr(type_filter)}."
)
# drop duplicates, sort for reproducibility
# itemgetter is used to ensure the sort does not extend to the 2nd item of
# the tuple
return sorted(set(estimators), key=itemgetter(0))
def all_displays():
"""Get a list of all displays from `sklearn`.
Returns
-------
displays : list of tuples
List of (name, class), where ``name`` is the display class name as
string and ``class`` is the actual type of the class.
Examples
--------
>>> from sklearn.utils.discovery import all_displays
>>> displays = all_displays()
>>> displays[0]
('CalibrationDisplay', <class 'sklearn.calibration.CalibrationDisplay'>)
"""
# lazy import to avoid circular imports from sklearn.base
from ._testing import ignore_warnings
all_classes = []
root = str(Path(__file__).parent.parent) # sklearn package
# Ignore deprecation warnings triggered at import time and from walking
# packages
with ignore_warnings(category=FutureWarning):
for _, module_name, _ in pkgutil.walk_packages(path=[root], prefix="sklearn."):
module_parts = module_name.split(".")
if (
any(part in _MODULE_TO_IGNORE for part in module_parts)
or "._" in module_name
):
continue
module = import_module(module_name)
classes = inspect.getmembers(module, inspect.isclass)
classes = [
(name, display_class)
for name, display_class in classes
if not name.startswith("_") and name.endswith("Display")
]
all_classes.extend(classes)
return sorted(set(all_classes), key=itemgetter(0))
def _is_checked_function(item):
if not inspect.isfunction(item):
return False
if item.__name__.startswith("_"):
return False
mod = item.__module__
if not mod.startswith("sklearn.") or mod.endswith("estimator_checks"):
return False
return True
def all_functions():
"""Get a list of all functions from `sklearn`.
Returns
-------
functions : list of tuples
List of (name, function), where ``name`` is the function name as
string and ``function`` is the actual function.
Examples
--------
>>> from sklearn.utils.discovery import all_functions
>>> functions = all_functions()
>>> name, function = functions[0]
>>> name
'accuracy_score'
"""
# lazy import to avoid circular imports from sklearn.base
from ._testing import ignore_warnings
all_functions = []
root = str(Path(__file__).parent.parent) # sklearn package
# Ignore deprecation warnings triggered at import time and from walking
# packages
with ignore_warnings(category=FutureWarning):
for _, module_name, _ in pkgutil.walk_packages(path=[root], prefix="sklearn."):
module_parts = module_name.split(".")
if (
any(part in _MODULE_TO_IGNORE for part in module_parts)
or "._" in module_name
):
continue
module = import_module(module_name)
functions = inspect.getmembers(module, _is_checked_function)
functions = [
(func.__name__, func)
for name, func in functions
if not name.startswith("_")
]
all_functions.extend(functions)
# drop duplicates, sort for reproducibility
# itemgetter is used to ensure the sort does not extend to the 2nd item of
# the tuple
return sorted(set(all_functions), key=itemgetter(0))
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,417 @@
"""Compatibility fixes for older version of python, numpy and scipy
If you add content to this file, please give the version of the package
at which the fix is no longer needed.
"""
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Fabian Pedregosa <fpedregosa@acm.org>
# Lars Buitinck
#
# License: BSD 3 clause
import platform
import struct
import numpy as np
import scipy
import scipy.sparse.linalg
import scipy.stats
import sklearn
from ..externals._packaging.version import parse as parse_version
_IS_PYPY = platform.python_implementation() == "PyPy"
_IS_32BIT = 8 * struct.calcsize("P") == 32
_IS_WASM = platform.machine() in ["wasm32", "wasm64"]
np_version = parse_version(np.__version__)
np_base_version = parse_version(np_version.base_version)
sp_version = parse_version(scipy.__version__)
sp_base_version = parse_version(sp_version.base_version)
# TODO: We can consider removing the containers and importing
# directly from SciPy when sparse matrices are deprecated.
CSR_CONTAINERS = [scipy.sparse.csr_matrix]
CSC_CONTAINERS = [scipy.sparse.csc_matrix]
COO_CONTAINERS = [scipy.sparse.coo_matrix]
LIL_CONTAINERS = [scipy.sparse.lil_matrix]
DOK_CONTAINERS = [scipy.sparse.dok_matrix]
BSR_CONTAINERS = [scipy.sparse.bsr_matrix]
DIA_CONTAINERS = [scipy.sparse.dia_matrix]
if parse_version(scipy.__version__) >= parse_version("1.8"):
# Sparse Arrays have been added in SciPy 1.8
# TODO: When SciPy 1.8 is the minimum supported version,
# those lists can be created directly without this condition.
# See: https://github.com/scikit-learn/scikit-learn/issues/27090
CSR_CONTAINERS.append(scipy.sparse.csr_array)
CSC_CONTAINERS.append(scipy.sparse.csc_array)
COO_CONTAINERS.append(scipy.sparse.coo_array)
LIL_CONTAINERS.append(scipy.sparse.lil_array)
DOK_CONTAINERS.append(scipy.sparse.dok_array)
BSR_CONTAINERS.append(scipy.sparse.bsr_array)
DIA_CONTAINERS.append(scipy.sparse.dia_array)
# Remove when minimum scipy version is 1.11.0
try:
from scipy.sparse import sparray # noqa
SPARRAY_PRESENT = True
except ImportError:
SPARRAY_PRESENT = False
# Remove when minimum scipy version is 1.8
try:
from scipy.sparse import csr_array # noqa
SPARSE_ARRAY_PRESENT = True
except ImportError:
SPARSE_ARRAY_PRESENT = False
try:
from scipy.optimize._linesearch import line_search_wolfe1, line_search_wolfe2
except ImportError: # SciPy < 1.8
from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1 # type: ignore # noqa
def _object_dtype_isnan(X):
return X != X
# Rename the `method` kwarg to `interpolation` for NumPy < 1.22, because
# `interpolation` kwarg was deprecated in favor of `method` in NumPy >= 1.22.
def _percentile(a, q, *, method="linear", **kwargs):
return np.percentile(a, q, interpolation=method, **kwargs)
if np_version < parse_version("1.22"):
percentile = _percentile
else: # >= 1.22
from numpy import percentile # type: ignore # noqa
# TODO: Remove when SciPy 1.11 is the minimum supported version
def _mode(a, axis=0):
if sp_version >= parse_version("1.9.0"):
mode = scipy.stats.mode(a, axis=axis, keepdims=True)
if sp_version >= parse_version("1.10.999"):
# scipy.stats.mode has changed returned array shape with axis=None
# and keepdims=True, see https://github.com/scipy/scipy/pull/17561
if axis is None:
mode = np.ravel(mode)
return mode
return scipy.stats.mode(a, axis=axis)
# TODO: Remove when Scipy 1.12 is the minimum supported version
if sp_base_version >= parse_version("1.12.0"):
_sparse_linalg_cg = scipy.sparse.linalg.cg
else:
def _sparse_linalg_cg(A, b, **kwargs):
if "rtol" in kwargs:
kwargs["tol"] = kwargs.pop("rtol")
if "atol" not in kwargs:
kwargs["atol"] = "legacy"
return scipy.sparse.linalg.cg(A, b, **kwargs)
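# Illustrative sketch (comment only): callers use the SciPy >= 1.12 keyword
# names, e.g.
#     x, info = _sparse_linalg_cg(A, b, rtol=1e-6)
# and on older SciPy the wrapper above renames rtol -> tol and defaults
# atol="legacy" to keep the historical stopping criterion.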
# TODO: Fuse the modern implementations of _sparse_min_max and _sparse_nan_min_max
# into the public min_max_axis function when Scipy 1.11 is the minimum supported
# version and delete the backport in the else branch below.
if sp_base_version >= parse_version("1.11.0"):
def _sparse_min_max(X, axis):
the_min = X.min(axis=axis)
the_max = X.max(axis=axis)
if axis is not None:
the_min = the_min.toarray().ravel()
the_max = the_max.toarray().ravel()
return the_min, the_max
def _sparse_nan_min_max(X, axis):
the_min = X.nanmin(axis=axis)
the_max = X.nanmax(axis=axis)
if axis is not None:
the_min = the_min.toarray().ravel()
the_max = the_max.toarray().ravel()
return the_min, the_max
else:
# This code is mostly taken from scipy 0.14 and extended to handle nans, see
# https://github.com/scikit-learn/scikit-learn/pull/11196
def _minor_reduce(X, ufunc):
major_index = np.flatnonzero(np.diff(X.indptr))
# reduceat tries to cast X.indptr to intp, which errors
# if it is int64 on a 32 bit system.
# Reinitializing prevents this where possible, see #13737
X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
value = ufunc.reduceat(X.data, X.indptr[major_index])
return major_index, value
def _min_or_max_axis(X, axis, min_or_max):
N = X.shape[axis]
if N == 0:
raise ValueError("zero-size array to reduction operation")
M = X.shape[1 - axis]
mat = X.tocsc() if axis == 0 else X.tocsr()
mat.sum_duplicates()
major_index, value = _minor_reduce(mat, min_or_max)
not_full = np.diff(mat.indptr)[major_index] < N
value[not_full] = min_or_max(value[not_full], 0)
mask = value != 0
major_index = np.compress(mask, major_index)
value = np.compress(mask, value)
if axis == 0:
res = scipy.sparse.coo_matrix(
(value, (np.zeros(len(value)), major_index)),
dtype=X.dtype,
shape=(1, M),
)
else:
res = scipy.sparse.coo_matrix(
(value, (major_index, np.zeros(len(value)))),
dtype=X.dtype,
shape=(M, 1),
)
return res.A.ravel()
def _sparse_min_or_max(X, axis, min_or_max):
if axis is None:
if 0 in X.shape:
raise ValueError("zero-size array to reduction operation")
zero = X.dtype.type(0)
if X.nnz == 0:
return zero
m = min_or_max.reduce(X.data.ravel())
if X.nnz != np.prod(X.shape):
m = min_or_max(zero, m)
return m
if axis < 0:
axis += 2
if (axis == 0) or (axis == 1):
return _min_or_max_axis(X, axis, min_or_max)
else:
raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
def _sparse_min_max(X, axis):
return (
_sparse_min_or_max(X, axis, np.minimum),
_sparse_min_or_max(X, axis, np.maximum),
)
def _sparse_nan_min_max(X, axis):
return (
_sparse_min_or_max(X, axis, np.fmin),
_sparse_min_or_max(X, axis, np.fmax),
)
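# Illustrative note on the backport above (comment only): implicit zeros take
# part in the reduction whenever the matrix is not completely filled, e.g. a
# 2x2 CSR matrix storing only the values [2., 3.] has nnz != np.prod(shape),
# so _sparse_min_max(X, axis=None) returns (0.0, 3.0) rather than (2.0, 3.0).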
# For NumPy >= 1.25, exceptions and warnings are moved to a dedicated
# submodule.
if np_version >= parse_version("1.25.0"):
from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning
else:
from numpy import ComplexWarning, VisibleDeprecationWarning # type: ignore # noqa
# TODO: Remove when Scipy 1.6 is the minimum supported version
try:
from scipy.integrate import trapezoid # type: ignore # noqa
except ImportError:
from scipy.integrate import trapz as trapezoid # type: ignore # noqa
# TODO: Adapt when Pandas > 2.2 is the minimum supported version
def pd_fillna(pd, frame):
pd_version = parse_version(pd.__version__).base_version
if parse_version(pd_version) < parse_version("2.2"):
frame = frame.fillna(value=np.nan)
else:
infer_objects_kwargs = (
{} if parse_version(pd_version) >= parse_version("3") else {"copy": False}
)
with pd.option_context("future.no_silent_downcasting", True):
frame = frame.fillna(value=np.nan).infer_objects(**infer_objects_kwargs)
return frame
# TODO: remove when SciPy 1.12 is the minimum supported version
def _preserve_dia_indices_dtype(
sparse_container, original_container_format, requested_sparse_format
):
"""Preserve indices dtype for SciPy < 1.12 when converting from DIA to CSR/CSC.
For SciPy < 1.12, DIA arrays indices are upcasted to `np.int64` that is
inconsistent with DIA matrices. We downcast the indices dtype to `np.int32` to
be consistent with DIA matrices.
The converted indices arrays are affected back inplace to the sparse container.
Parameters
----------
sparse_container : sparse container
Sparse container to be checked.
requested_sparse_format : str or bool
The type of format of `sparse_container`.
Notes
-----
See https://github.com/scipy/scipy/issues/19245 for more details.
"""
if original_container_format == "dia_array" and requested_sparse_format in (
"csr",
"coo",
):
if requested_sparse_format == "csr":
index_dtype = _smallest_admissible_index_dtype(
arrays=(sparse_container.indptr, sparse_container.indices),
maxval=max(sparse_container.nnz, sparse_container.shape[1]),
check_contents=True,
)
sparse_container.indices = sparse_container.indices.astype(
index_dtype, copy=False
)
sparse_container.indptr = sparse_container.indptr.astype(
index_dtype, copy=False
)
else: # requested_sparse_format == "coo"
index_dtype = _smallest_admissible_index_dtype(
maxval=max(sparse_container.shape)
)
sparse_container.row = sparse_container.row.astype(index_dtype, copy=False)
sparse_container.col = sparse_container.col.astype(index_dtype, copy=False)
# TODO: remove when SciPy 1.12 is the minimum supported version
def _smallest_admissible_index_dtype(arrays=(), maxval=None, check_contents=False):
"""Based on input (integer) arrays `a`, determine a suitable index data
type that can hold the data in the arrays.
This function returns `np.int64` if it either required by `maxval` or based on the
largest precision of the dtype of the arrays passed as argument, or by the their
contents (when `check_contents is True`). If none of the condition requires
`np.int64` then this function returns `np.int32`.
Parameters
----------
arrays : ndarray or tuple of ndarrays, default=()
Input arrays whose types/contents to check.
maxval : float, default=None
Maximum value needed.
check_contents : bool, default=False
Whether to check the values in the arrays and not just their types.
By default, check only the types.
Returns
-------
dtype : {np.int32, np.int64}
Suitable index data type (int32 or int64).
"""
int32min = np.int32(np.iinfo(np.int32).min)
int32max = np.int32(np.iinfo(np.int32).max)
if maxval is not None:
if maxval > np.iinfo(np.int64).max:
raise ValueError(
f"maxval={maxval} is to large to be represented as np.int64."
)
if maxval > int32max:
return np.int64
if isinstance(arrays, np.ndarray):
arrays = (arrays,)
for arr in arrays:
if not isinstance(arr, np.ndarray):
raise TypeError(
f"Arrays should be of type np.ndarray, got {type(arr)} instead."
)
if not np.issubdtype(arr.dtype, np.integer):
raise ValueError(
f"Array dtype {arr.dtype} is not supported for index dtype. We expect "
"integral values."
)
if not np.can_cast(arr.dtype, np.int32):
if not check_contents:
# when `check_contents` is False, we stay on the safe side and return
# np.int64.
return np.int64
if arr.size == 0:
# a bigger type not needed yet, let's look at the next array
continue
else:
maxval = arr.max()
minval = arr.min()
if minval < int32min or maxval > int32max:
# a big index type is actually needed
return np.int64
return np.int32
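# Illustrative sketch (comment only): an int64 array whose values fit in int32
# can still be downcast when check_contents=True, e.g.
#     _smallest_admissible_index_dtype(np.array([0, 5], dtype=np.int64),
#                                      check_contents=True)
# returns np.int32, whereas maxval=2**35 (above the int32 maximum) forces
# np.int64 regardless of the arrays.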
# TODO: Remove when Scipy 1.12 is the minimum supported version
if sp_version < parse_version("1.12"):
from ..externals._scipy.sparse.csgraph import laplacian # type: ignore # noqa
else:
from scipy.sparse.csgraph import laplacian # type: ignore # noqa # pragma: no cover
# TODO: Remove when we drop support for Python 3.9. Note the filter argument has
# been back-ported in 3.9.17 but we can not assume anything about the micro
# version, see
# https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
# for more details
def tarfile_extractall(tarfile, path):
try:
tarfile.extractall(path, filter="data")
except TypeError:
tarfile.extractall(path)
def _in_unstable_openblas_configuration():
"""Return True if in an unstable configuration for OpenBLAS"""
# Import libraries which might load OpenBLAS.
import numpy # noqa
import scipy # noqa
modules_info = sklearn._threadpool_controller.info()
open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
if not open_blas_used:
return False
# OpenBLAS 0.3.16 fixed instability for arm64, see:
# https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa
openblas_arm64_stable_version = parse_version("0.3.16")
for info in modules_info:
if info["internal_api"] != "openblas":
continue
openblas_version = info.get("version")
openblas_architecture = info.get("architecture")
if openblas_version is None or openblas_architecture is None:
# Cannot be sure that OpenBLAS is good enough. Assume unstable:
return True # pragma: no cover
if (
openblas_architecture == "neoversen1"
and parse_version(openblas_version) < openblas_arm64_stable_version
):
# See discussions in https://github.com/numpy/numpy/issues/19411
return True # pragma: no cover
return False
@@ -0,0 +1,166 @@
"""
The :mod:`sklearn.utils.graph` module includes graph utilities and algorithms.
"""
# Authors: Aric Hagberg <hagberg@lanl.gov>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Jake Vanderplas <vanderplas@astro.washington.edu>
# License: BSD 3 clause
import numpy as np
from scipy import sparse
from ..metrics.pairwise import pairwise_distances
from ._param_validation import Integral, Interval, validate_params
###############################################################################
# Path and connected component analysis.
# Code adapted from networkx
@validate_params(
{
"graph": ["array-like", "sparse matrix"],
"source": [Interval(Integral, 0, None, closed="left")],
"cutoff": [Interval(Integral, 0, None, closed="left"), None],
},
prefer_skip_nested_validation=True,
)
def single_source_shortest_path_length(graph, source, *, cutoff=None):
"""Return the length of the shortest path from source to all reachable nodes.
Parameters
----------
graph : {array-like, sparse matrix} of shape (n_nodes, n_nodes)
Adjacency matrix of the graph. Sparse matrix of format LIL is
preferred.
source : int
Start node for path.
cutoff : int, default=None
Depth to stop the search - only paths of length <= cutoff are returned.
Returns
-------
paths : dict
Reachable end nodes mapped to length of path from source,
i.e. `{end: path_length}`.
Examples
--------
>>> from sklearn.utils.graph import single_source_shortest_path_length
>>> import numpy as np
>>> graph = np.array([[ 0, 1, 0, 0],
... [ 1, 0, 1, 0],
... [ 0, 1, 0, 0],
... [ 0, 0, 0, 0]])
>>> single_source_shortest_path_length(graph, 0)
{0: 0, 1: 1, 2: 2}
>>> graph = np.ones((6, 6))
>>> sorted(single_source_shortest_path_length(graph, 2).items())
[(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]
"""
if sparse.issparse(graph):
graph = graph.tolil()
else:
graph = sparse.lil_matrix(graph)
seen = {} # level (number of hops) when seen in BFS
level = 0 # the current level
next_level = [source] # nodes to check at next level
while next_level:
this_level = next_level # advance to next level
next_level = set() # and start a new set (fringe)
for v in this_level:
if v not in seen:
seen[v] = level # set the level of vertex v
next_level.update(graph.rows[v])
if cutoff is not None and cutoff <= level:
break
level += 1
return seen # return all path lengths as dictionary
def _fix_connected_components(
X,
graph,
n_connected_components,
component_labels,
mode="distance",
metric="euclidean",
**kwargs,
):
"""Add connections to sparse graph to connect unconnected components.
For each pair of unconnected components, compute all pairwise distances
from one component to the other, and add a connection on the closest pair
of samples. This is a hacky way to get a graph with a single connected
component, which is necessary for example to compute a shortest path
between all pairs of samples in the graph.
Parameters
----------
X : array of shape (n_samples, n_features) or (n_samples, n_samples)
Features to compute the pairwise distances. If `metric =
"precomputed"`, X is the matrix of pairwise distances.
graph : sparse matrix of shape (n_samples, n_samples)
Graph of connection between samples.
n_connected_components : int
Number of connected components, as computed by
`scipy.sparse.csgraph.connected_components`.
component_labels : array of shape (n_samples)
Labels of connected components, as computed by
`scipy.sparse.csgraph.connected_components`.
mode : {'connectivity', 'distance'}, default='distance'
Type of graph matrix: 'connectivity' corresponds to the connectivity
matrix with ones and zeros, and 'distance' corresponds to the distances
between neighbors according to the given metric.
metric : str
Metric used in `sklearn.metrics.pairwise.pairwise_distances`.
kwargs : kwargs
Keyword arguments passed to
`sklearn.metrics.pairwise.pairwise_distances`.
Returns
-------
graph : sparse matrix of shape (n_samples, n_samples)
Graph of connection between samples, with a single connected component.
"""
if metric == "precomputed" and sparse.issparse(X):
raise RuntimeError(
"_fix_connected_components with metric='precomputed' requires the "
"full distance matrix in X, and does not work with a sparse "
"neighbors graph."
)
for i in range(n_connected_components):
idx_i = np.flatnonzero(component_labels == i)
Xi = X[idx_i]
for j in range(i):
idx_j = np.flatnonzero(component_labels == j)
Xj = X[idx_j]
if metric == "precomputed":
D = X[np.ix_(idx_i, idx_j)]
else:
D = pairwise_distances(Xi, Xj, metric=metric, **kwargs)
ii, jj = np.unravel_index(D.argmin(axis=None), D.shape)
if mode == "connectivity":
graph[idx_i[ii], idx_j[jj]] = 1
graph[idx_j[jj], idx_i[ii]] = 1
elif mode == "distance":
graph[idx_i[ii], idx_j[jj]] = D[ii, jj]
graph[idx_j[jj], idx_i[ii]] = D[ii, jj]
else:
raise ValueError(
"Unknown mode=%r, should be one of ['connectivity', 'distance']."
% mode
)
return graph
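# Illustrative sketch (comment only, hypothetical inputs): with
# component_labels = np.array([0, 0, 1, 1]), the loops compare component 0
# (samples 0, 1) against component 1 (samples 2, 3), locate the closest pair
# (ii, jj) from pairwise_distances (or from X itself when metric="precomputed"),
# and add a single symmetric entry to `graph` for that pair, after which
# scipy.sparse.csgraph.connected_components reports one component.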
@@ -0,0 +1,74 @@
# utils is cimported from other subpackages so this is needed for the cimport
# to work
utils_cython_tree = [
# We add sklearn_root_cython_tree to make sure sklearn/__init__.py is copied
# early in the build
sklearn_root_cython_tree,
fs.copyfile('__init__.py'),
fs.copyfile('_cython_blas.pxd'),
fs.copyfile('_heap.pxd'),
fs.copyfile('_openmp_helpers.pxd'),
fs.copyfile('_random.pxd'),
fs.copyfile('_sorting.pxd'),
fs.copyfile('_typedefs.pxd'),
fs.copyfile('_vector_sentinel.pxd'),
]
utils_extension_metadata = {
'sparsefuncs_fast':
{'sources': ['sparsefuncs_fast.pyx']},
'_cython_blas': {'sources': ['_cython_blas.pyx']},
'arrayfuncs': {'sources': ['arrayfuncs.pyx']},
'murmurhash': {
'sources': ['murmurhash.pyx', 'src' / 'MurmurHash3.cpp'],
},
'_fast_dict':
{'sources': ['_fast_dict.pyx'], 'override_options': ['cython_language=cpp']},
'_openmp_helpers': {'sources': ['_openmp_helpers.pyx'], 'dependencies': [openmp_dep]},
'_random': {'sources': ['_random.pyx']},
'_typedefs': {'sources': ['_typedefs.pyx']},
'_heap': {'sources': ['_heap.pyx']},
'_sorting': {'sources': ['_sorting.pyx']},
'_vector_sentinel':
{'sources': ['_vector_sentinel.pyx'], 'override_options': ['cython_language=cpp'],
'dependencies': [np_dep]},
'_isfinite': {'sources': ['_isfinite.pyx']},
}
foreach ext_name, ext_dict : utils_extension_metadata
py.extension_module(
ext_name,
[ext_dict.get('sources'), utils_cython_tree],
dependencies: ext_dict.get('dependencies', []),
override_options : ext_dict.get('override_options', []),
cython_args: cython_args,
subdir: 'sklearn/utils',
install: true
)
endforeach
util_extension_names = ['_seq_dataset', '_weight_vector']
foreach name: util_extension_names
pxd = custom_target(
name + '_pxd',
output: name + '.pxd',
input: name + '.pxd.tp',
command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'],
)
utils_cython_tree += [pxd]
pyx = custom_target(
name + '_pyx',
output: name + '.pyx',
input: name + '.pyx.tp',
command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@']
)
py.extension_module(
name,
[pxd, pyx, utils_cython_tree],
cython_args: cython_args,
subdir: 'sklearn/utils',
install: true
)
endforeach
@@ -0,0 +1,22 @@
"""
The :mod:`sklearn.utils.metadata_routing` module includes utilities to route
metadata within scikit-learn estimators.
"""
# This module is not a separate sub-folder since that would result in a circular
# import issue.
#
# Author: Adrin Jalali <adrin.jalali@gmail.com>
# License: BSD 3 clause
from ._metadata_requests import WARN, UNUSED, UNCHANGED # noqa
from ._metadata_requests import get_routing_for_object # noqa
from ._metadata_requests import MetadataRouter # noqa
from ._metadata_requests import MetadataRequest # noqa
from ._metadata_requests import MethodMapping # noqa
from ._metadata_requests import process_routing # noqa
from ._metadata_requests import _MetadataRequester # noqa
from ._metadata_requests import _routing_enabled # noqa
from ._metadata_requests import _raise_for_params # noqa
from ._metadata_requests import _RoutingNotSupportedMixin # noqa
from ._metadata_requests import _raise_for_unsupported_routing # noqa
@@ -0,0 +1,165 @@
"""
The :mod:`sklearn.utils.metaestimators` module includes utilities for meta-estimators.
"""
# Author: Joel Nothman
# Andreas Mueller
# License: BSD
from abc import ABCMeta, abstractmethod
from contextlib import suppress
from typing import Any, List
import numpy as np
from ..base import BaseEstimator
from ..utils import _safe_indexing
from ..utils._tags import _safe_tags
from ._available_if import available_if
__all__ = ["available_if"]
class _BaseComposition(BaseEstimator, metaclass=ABCMeta):
"""Handles parameter management for classifiers composed of named estimators."""
steps: List[Any]
@abstractmethod
def __init__(self):
pass
def _get_params(self, attr, deep=True):
out = super().get_params(deep=deep)
if not deep:
return out
estimators = getattr(self, attr)
try:
out.update(estimators)
except (TypeError, ValueError):
# Ignore TypeError for cases where estimators is not a list of
# (name, estimator) and ignore ValueError when the list is not
# formatted correctly. This is to prevent errors when calling
# `set_params`. `BaseEstimator.set_params` calls `get_params` which
# can error for invalid values for `estimators`.
return out
for name, estimator in estimators:
if hasattr(estimator, "get_params"):
for key, value in estimator.get_params(deep=True).items():
out["%s__%s" % (name, key)] = value
return out
def _set_params(self, attr, **params):
# Ensure strict ordering of parameter setting:
# 1. All steps
if attr in params:
setattr(self, attr, params.pop(attr))
# 2. Replace items with estimators in params
items = getattr(self, attr)
if isinstance(items, list) and items:
# Get item names used to identify valid names in params
# `zip` raises a TypeError when `items` does not contain
# elements of length 2
with suppress(TypeError):
item_names, _ = zip(*items)
for name in list(params.keys()):
if "__" not in name and name in item_names:
self._replace_estimator(attr, name, params.pop(name))
# 3. Step parameters and other initialisation arguments
super().set_params(**params)
return self
def _replace_estimator(self, attr, name, new_val):
# assumes `name` is a valid estimator name
new_estimators = list(getattr(self, attr))
for i, (estimator_name, _) in enumerate(new_estimators):
if estimator_name == name:
new_estimators[i] = (name, new_val)
break
setattr(self, attr, new_estimators)
def _validate_names(self, names):
if len(set(names)) != len(names):
raise ValueError("Names provided are not unique: {0!r}".format(list(names)))
invalid_names = set(names).intersection(self.get_params(deep=False))
if invalid_names:
raise ValueError(
"Estimator names conflict with constructor arguments: {0!r}".format(
sorted(invalid_names)
)
)
invalid_names = [name for name in names if "__" in name]
if invalid_names:
raise ValueError(
"Estimator names must not contain __: got {0!r}".format(invalid_names)
)
def _safe_split(estimator, X, y, indices, train_indices=None):
"""Create subset of dataset and properly handle kernels.
Slice X, y according to indices for cross-validation, but take care of
precomputed kernel-matrices or pairwise affinities / distances.
If ``estimator._pairwise is True``, X needs to be square and
we slice rows and columns. If ``train_indices`` is not None,
we slice rows using ``indices`` (assumed the test set) and columns
using ``train_indices``, indicating the training set.
Labels y will always be indexed only along the first axis.
Parameters
----------
estimator : object
Estimator to determine whether we should slice only rows or rows and
columns.
X : array-like, sparse matrix or iterable
Data to be indexed. If ``estimator._pairwise is True``,
this needs to be a square array-like or sparse matrix.
y : array-like, sparse matrix or iterable
Targets to be indexed.
indices : array of int
Rows to select from X and y.
If ``estimator._pairwise is True`` and ``train_indices is None``
then ``indices`` will also be used to slice columns.
train_indices : array of int or None, default=None
If ``estimator._pairwise is True`` and ``train_indices is not None``,
then ``train_indices`` will be used to slice the columns of X.
Returns
-------
X_subset : array-like, sparse matrix or list
Indexed data.
y_subset : array-like, sparse matrix or list
Indexed targets.
"""
if _safe_tags(estimator, key="pairwise"):
if not hasattr(X, "shape"):
raise ValueError(
"Precomputed kernels or affinity matrices have "
"to be passed as arrays or sparse matrices."
)
# X is a precomputed square kernel matrix
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square kernel matrix")
if train_indices is None:
X_subset = X[np.ix_(indices, indices)]
else:
X_subset = X[np.ix_(indices, train_indices)]
else:
X_subset = _safe_indexing(X, indices)
if y is not None:
y_subset = _safe_indexing(y, indices)
else:
y_subset = None
return X_subset, y_subset
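# Illustrative sketch (comment only, hypothetical precomputed kernel): for a
# pairwise estimator with a 4x4 kernel K, indices=[2, 3] and
# train_indices=[0, 1],
#     X_subset = K[np.ix_([2, 3], [0, 1])]
# i.e. the (2, 2) block of test-vs-train kernel values, while y is always
# sliced along the first axis only.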
@@ -0,0 +1,564 @@
"""
The :mod:`sklearn.utils.multiclass` module includes utilities to handle
multiclass/multioutput target in classifiers.
"""
# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
#
# License: BSD 3 clause
import warnings
from collections.abc import Sequence
from itertools import chain
import numpy as np
from scipy.sparse import issparse
from ..utils._array_api import get_namespace
from ..utils.fixes import VisibleDeprecationWarning
from .validation import _assert_all_finite, check_array
def _unique_multiclass(y):
xp, is_array_api_compliant = get_namespace(y)
if hasattr(y, "__array__") or is_array_api_compliant:
return xp.unique_values(xp.asarray(y))
else:
return set(y)
def _unique_indicator(y):
xp, _ = get_namespace(y)
return xp.arange(
check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"]).shape[1]
)
_FN_UNIQUE_LABELS = {
"binary": _unique_multiclass,
"multiclass": _unique_multiclass,
"multilabel-indicator": _unique_indicator,
}
def unique_labels(*ys):
"""Extract an ordered array of unique labels.
We don't allow:
- mix of multilabel and multiclass (single label) targets
- mix of label indicator matrix and anything else,
because there are no explicit labels
- mix of label indicator matrices of different sizes
- mix of string and integer labels
At the moment, we also don't allow "multiclass-multioutput" input type.
Parameters
----------
*ys : array-likes
Label values.
Returns
-------
out : ndarray of shape (n_unique_labels,)
An ordered array of unique labels.
Examples
--------
>>> from sklearn.utils.multiclass import unique_labels
>>> unique_labels([3, 5, 5, 5, 7, 7])
array([3, 5, 7])
>>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
array([1, 2, 3, 4])
>>> unique_labels([1, 2, 10], [5, 11])
array([ 1, 2, 5, 10, 11])
"""
xp, is_array_api_compliant = get_namespace(*ys)
if not ys:
raise ValueError("No argument has been passed.")
# Check that we don't mix label format
ys_types = set(type_of_target(x) for x in ys)
if ys_types == {"binary", "multiclass"}:
ys_types = {"multiclass"}
if len(ys_types) > 1:
raise ValueError("Mix type of y not allowed, got types %s" % ys_types)
label_type = ys_types.pop()
# Check consistency for the indicator format
if (
label_type == "multilabel-indicator"
and len(
set(
check_array(y, accept_sparse=["csr", "csc", "coo"]).shape[1] for y in ys
)
)
> 1
):
raise ValueError(
"Multi-label binary indicator input with different numbers of labels"
)
# Get the unique set of labels
_unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
if not _unique_labels:
raise ValueError("Unknown label type: %s" % repr(ys))
if is_array_api_compliant:
# array_api does not allow for mixed dtypes
unique_ys = xp.concat([_unique_labels(y) for y in ys])
return xp.unique_values(unique_ys)
ys_labels = set(chain.from_iterable((i for i in _unique_labels(y)) for y in ys))
# Check that we don't mix string type with number type
if len(set(isinstance(label, str) for label in ys_labels)) > 1:
raise ValueError("Mix of label input types (string and number)")
return xp.asarray(sorted(ys_labels))
def _is_integral_float(y):
xp, is_array_api_compliant = get_namespace(y)
return xp.isdtype(y.dtype, "real floating") and bool(
xp.all(xp.astype((xp.astype(y, xp.int64)), y.dtype) == y)
)
def is_multilabel(y):
"""Check if ``y`` is in a multilabel format.
Parameters
----------
y : ndarray of shape (n_samples,)
Target values.
Returns
-------
out : bool
Return ``True`` if ``y`` is in a multilabel format, else ``False``.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils.multiclass import is_multilabel
>>> is_multilabel([0, 1, 0, 1])
False
>>> is_multilabel([[1], [0, 2], []])
False
>>> is_multilabel(np.array([[1, 0], [0, 0]]))
True
>>> is_multilabel(np.array([[1], [0], [0]]))
False
>>> is_multilabel(np.array([[1, 0, 0]]))
True
"""
xp, is_array_api_compliant = get_namespace(y)
if hasattr(y, "__array__") or isinstance(y, Sequence) or is_array_api_compliant:
# DeprecationWarning will be replaced by ValueError, see NEP 34
# https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
check_y_kwargs = dict(
accept_sparse=True,
allow_nd=True,
force_all_finite=False,
ensure_2d=False,
ensure_min_samples=0,
ensure_min_features=0,
)
with warnings.catch_warnings():
warnings.simplefilter("error", VisibleDeprecationWarning)
try:
y = check_array(y, dtype=None, **check_y_kwargs)
except (VisibleDeprecationWarning, ValueError) as e:
if str(e).startswith("Complex data not supported"):
raise
# dtype=object should be provided explicitly for ragged arrays,
# see NEP 34
y = check_array(y, dtype=object, **check_y_kwargs)
if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
return False
if issparse(y):
if y.format in ("dok", "lil"):
y = y.tocsr()
labels = xp.unique_values(y.data)
return (
len(y.data) == 0
or (labels.size == 1 or (labels.size == 2) and (0 in labels))
and (y.dtype.kind in "biu" or _is_integral_float(labels)) # bool, int, uint
)
else:
labels = xp.unique_values(y)
return labels.shape[0] < 3 and (
xp.isdtype(y.dtype, ("bool", "signed integer", "unsigned integer"))
or _is_integral_float(labels)
)
def check_classification_targets(y):
"""Ensure that target y is of a non-regression type.
Only the following target types (as defined in type_of_target) are allowed:
'binary', 'multiclass', 'multiclass-multioutput',
'multilabel-indicator', 'multilabel-sequences'
Parameters
----------
y : array-like
Target values.
"""
y_type = type_of_target(y, input_name="y")
if y_type not in [
"binary",
"multiclass",
"multiclass-multioutput",
"multilabel-indicator",
"multilabel-sequences",
]:
raise ValueError(
f"Unknown label type: {y_type}. Maybe you are trying to fit a "
"classifier, which expects discrete classes on a "
"regression target with continuous values."
)
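# Illustrative sketch of the intended behaviour (values chosen ad hoc):
# >>> check_classification_targets([0, 1, 1, 0])   # binary target: passes silently
# >>> check_classification_targets([0.1, 0.6])     # continuous target: raises ValueError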
def type_of_target(y, input_name=""):
"""Determine the type of data indicated by the target.
Note that this type is the most specific type that can be inferred.
For example:
* ``binary`` is more specific but compatible with ``multiclass``.
* ``multiclass`` of integers is more specific but compatible with
``continuous``.
* ``multilabel-indicator`` is more specific but compatible with
``multiclass-multioutput``.
Parameters
----------
y : {array-like, sparse matrix}
Target values. If a sparse matrix, `y` is expected to be a
CSR/CSC matrix.
input_name : str, default=""
The data name used to construct the error message.
.. versionadded:: 1.1.0
Returns
-------
target_type : str
One of:
* 'continuous': `y` is an array-like of floats that are not all
integers, and is 1d or a column vector.
* 'continuous-multioutput': `y` is a 2d array of floats that are
not all integers, and both dimensions are of size > 1.
* 'binary': `y` contains <= 2 discrete values and is 1d or a column
vector.
* 'multiclass': `y` contains more than two discrete values, is not a
sequence of sequences, and is 1d or a column vector.
* 'multiclass-multioutput': `y` is a 2d array that contains more
than two discrete values, is not a sequence of sequences, and both
dimensions are of size > 1.
* 'multilabel-indicator': `y` is a label indicator matrix, an array
of two dimensions with at least two columns, and at most 2 unique
values.
* 'unknown': `y` is array-like but none of the above, such as a 3d
array, sequence of sequences, or an array of non-sequence objects.
Examples
--------
>>> from sklearn.utils.multiclass import type_of_target
>>> import numpy as np
>>> type_of_target([0.1, 0.6])
'continuous'
>>> type_of_target([1, -1, -1, 1])
'binary'
>>> type_of_target(['a', 'b', 'a'])
'binary'
>>> type_of_target([1.0, 2.0])
'binary'
>>> type_of_target([1, 0, 2])
'multiclass'
>>> type_of_target([1.0, 0.0, 3.0])
'multiclass'
>>> type_of_target(['a', 'b', 'c'])
'multiclass'
>>> type_of_target(np.array([[1, 2], [3, 1]]))
'multiclass-multioutput'
>>> type_of_target([[1, 2]])
'multilabel-indicator'
>>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
'continuous-multioutput'
>>> type_of_target(np.array([[0, 1], [1, 1]]))
'multilabel-indicator'
"""
xp, is_array_api_compliant = get_namespace(y)
valid = (
(isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__"))
and not isinstance(y, str)
or is_array_api_compliant
)
if not valid:
raise ValueError(
"Expected array-like (array or non-string sequence), got %r" % y
)
sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
if sparse_pandas:
raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
if is_multilabel(y):
return "multilabel-indicator"
# DeprecationWarning will be replaced by ValueError, see NEP 34
# https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
# We therefore catch both deprecation (NumPy < 1.24) warning and
# value error (NumPy >= 1.24).
check_y_kwargs = dict(
accept_sparse=True,
allow_nd=True,
force_all_finite=False,
ensure_2d=False,
ensure_min_samples=0,
ensure_min_features=0,
)
with warnings.catch_warnings():
warnings.simplefilter("error", VisibleDeprecationWarning)
if not issparse(y):
try:
y = check_array(y, dtype=None, **check_y_kwargs)
except (VisibleDeprecationWarning, ValueError) as e:
if str(e).startswith("Complex data not supported"):
raise
# dtype=object should be provided explicitly for ragged arrays,
# see NEP 34
y = check_array(y, dtype=object, **check_y_kwargs)
try:
# TODO(1.7): Change to ValueError when byte labels is deprecated.
# labels in bytes format
first_row_or_val = y[[0], :] if issparse(y) else y[0]
if isinstance(first_row_or_val, bytes):
warnings.warn(
(
"Support for labels represented as bytes is deprecated in v1.5 and"
" will error in v1.7. Convert the labels to a string or integer"
" format."
),
FutureWarning,
)
# The old sequence of sequences format
if (
not hasattr(first_row_or_val, "__array__")
and isinstance(first_row_or_val, Sequence)
and not isinstance(first_row_or_val, str)
):
raise ValueError(
"You appear to be using a legacy multi-label data"
" representation. Sequence of sequences are no"
" longer supported; use a binary array or sparse"
" matrix instead - the MultiLabelBinarizer"
" transformer can convert to this format."
)
except IndexError:
pass
# Invalid inputs
if y.ndim not in (1, 2):
# Number of dimension greater than 2: [[[1, 2]]]
return "unknown"
if not min(y.shape):
# Empty ndarray: []/[[]]
if y.ndim == 1:
# 1-D empty array: []
return "binary" # []
# 2-D empty array: [[]]
return "unknown"
if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str):
# [obj_1] and not ["label_1"]
return "unknown"
# Check if multioutput
if y.ndim == 2 and y.shape[1] > 1:
suffix = "-multioutput" # [[1, 2], [1, 2]]
else:
suffix = "" # [1, 2, 3] or [[1], [2], [3]]
# Check float and contains non-integer float values
if xp.isdtype(y.dtype, "real floating"):
# [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
data = y.data if issparse(y) else y
if xp.any(data != xp.astype(data, int)):
_assert_all_finite(data, input_name=input_name)
return "continuous" + suffix
# Check multiclass
if issparse(first_row_or_val):
first_row_or_val = first_row_or_val.data
if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row_or_val) > 1):
# [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
return "multiclass" + suffix
else:
return "binary" # [1, 2] or [["a"], ["b"]]
def _check_partial_fit_first_call(clf, classes=None):
"""Private helper function for factorizing common classes param logic.
Estimators that implement the ``partial_fit`` API need to be provided with
the list of possible classes at the first call to partial_fit.
Subsequent calls to partial_fit should check that ``classes`` is still
consistent with a previous value of ``clf.classes_`` when provided.
This function returns True if it detects that this was the first call to
``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
set on ``clf``.
"""
if getattr(clf, "classes_", None) is None and classes is None:
raise ValueError("classes must be passed on the first call to partial_fit.")
elif classes is not None:
if getattr(clf, "classes_", None) is not None:
if not np.array_equal(clf.classes_, unique_labels(classes)):
raise ValueError(
"`classes=%r` is not the same as on last call "
"to partial_fit, was: %r" % (classes, clf.classes_)
)
else:
# This is the first call to partial_fit
clf.classes_ = unique_labels(classes)
return True
# classes is None and clf.classes_ has already previously been set:
# nothing to do
return False
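# Illustrative sketch (using a throwaway object in place of a real estimator):
# >>> class _Clf: pass
# >>> clf = _Clf()
# >>> _check_partial_fit_first_call(clf, classes=[0, 1, 2])   # first call: sets classes_
# True
# >>> clf.classes_
# array([0, 1, 2])
# >>> _check_partial_fit_first_call(clf)                      # later calls: nothing to do
# False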
def class_distribution(y, sample_weight=None):
"""Compute class priors from multioutput-multiclass target data.
Parameters
----------
y : {array-like, sparse matrix} of size (n_samples, n_outputs)
The labels for each example.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
classes : list of size n_outputs of ndarray of size (n_classes,)
List of classes for each column.
n_classes : list of int of size n_outputs
Number of classes in each column.
class_prior : list of size n_outputs of ndarray of size (n_classes,)
Class distribution of each column.
"""
classes = []
n_classes = []
class_prior = []
n_samples, n_outputs = y.shape
if sample_weight is not None:
sample_weight = np.asarray(sample_weight)
if issparse(y):
y = y.tocsc()
y_nnz = np.diff(y.indptr)
for k in range(n_outputs):
col_nonzero = y.indices[y.indptr[k] : y.indptr[k + 1]]
# separate sample weights for zero and non-zero elements
if sample_weight is not None:
nz_samp_weight = sample_weight[col_nonzero]
zeros_samp_weight_sum = np.sum(sample_weight) - np.sum(nz_samp_weight)
else:
nz_samp_weight = None
zeros_samp_weight_sum = y.shape[0] - y_nnz[k]
classes_k, y_k = np.unique(
y.data[y.indptr[k] : y.indptr[k + 1]], return_inverse=True
)
class_prior_k = np.bincount(y_k, weights=nz_samp_weight)
# An explicit zero was found, combine its weight with the weight
# of the implicit zeros
if 0 in classes_k:
class_prior_k[classes_k == 0] += zeros_samp_weight_sum
# If there is an implicit zero and it is not in classes and
# class_prior, make an entry for it
if 0 not in classes_k and y_nnz[k] < y.shape[0]:
classes_k = np.insert(classes_k, 0, 0)
class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum)
classes.append(classes_k)
n_classes.append(classes_k.shape[0])
class_prior.append(class_prior_k / class_prior_k.sum())
else:
for k in range(n_outputs):
classes_k, y_k = np.unique(y[:, k], return_inverse=True)
classes.append(classes_k)
n_classes.append(classes_k.shape[0])
class_prior_k = np.bincount(y_k, weights=sample_weight)
class_prior.append(class_prior_k / class_prior_k.sum())
return (classes, n_classes, class_prior)
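# Illustrative sketch with a small dense multioutput target (ad hoc values):
# >>> y = np.array([[1, 0], [2, 0], [1, 3]])
# >>> classes, n_classes, class_prior = class_distribution(y)
# >>> classes
# [array([1, 2]), array([0, 3])]
# >>> n_classes
# [2, 2]
# >>> [np.round(p, 2) for p in class_prior]
# [array([0.67, 0.33]), array([0.67, 0.33])]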
def _ovr_decision_function(predictions, confidences, n_classes):
"""Compute a continuous, tie-breaking OvR decision function from OvO.
It is important to include a continuous value, not only votes,
to make computing AUC or calibration meaningful.
Parameters
----------
predictions : array-like of shape (n_samples, n_classifiers)
Predicted classes for each binary classifier.
confidences : array-like of shape (n_samples, n_classifiers)
Decision functions or predicted probabilities for positive class
for each binary classifier.
n_classes : int
Number of classes. n_classifiers must be
``n_classes * (n_classes - 1 ) / 2``.
"""
n_samples = predictions.shape[0]
votes = np.zeros((n_samples, n_classes))
sum_of_confidences = np.zeros((n_samples, n_classes))
k = 0
for i in range(n_classes):
for j in range(i + 1, n_classes):
sum_of_confidences[:, i] -= confidences[:, k]
sum_of_confidences[:, j] += confidences[:, k]
votes[predictions[:, k] == 0, i] += 1
votes[predictions[:, k] == 1, j] += 1
k += 1
# Monotonically transform the sum_of_confidences to (-1/3, 1/3)
# and add it with votes. The monotonic transformation is
# f: x -> x / (3 * (|x| + 1)), it uses 1/3 instead of 1/2
# to ensure that we won't reach the limits and change vote order.
# The motivation is to use confidence levels as a way to break ties in
# the votes without switching any decision made based on a difference
# of 1 vote.
transformed_confidences = sum_of_confidences / (
3 * (np.abs(sum_of_confidences) + 1)
)
return votes + transformed_confidences
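# Illustrative sketch for 3 classes (OvO pairs (0, 1), (0, 2), (1, 2); ad hoc values):
# >>> preds = np.array([[1, 0, 0]])       # class 1 beats 0, 0 beats 2, 1 beats 2
# >>> confs = np.array([[0.5, -0.2, 0.3]])
# >>> scores = _ovr_decision_function(preds, confs, n_classes=3)
# >>> np.argmax(scores, axis=1)           # raw votes are [1, 2, 0]; class 1 wins
# array([1])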
@@ -0,0 +1,21 @@
"""Export fast murmurhash C/C++ routines + cython wrappers"""
from ..utils._typedefs cimport int32_t, uint32_t
# The C API is disabled for now, since it requires -I flags to get
# compilation to work even when these functions are not used.
# cdef extern from "MurmurHash3.h":
# void MurmurHash3_x86_32(void* key, int len, unsigned int seed,
# void* out)
#
# void MurmurHash3_x86_128(void* key, int len, unsigned int seed,
# void* out)
#
# void MurmurHash3_x64_128(void* key, int len, unsigned int seed,
# void* out)
cpdef uint32_t murmurhash3_int_u32(int key, unsigned int seed)
cpdef int32_t murmurhash3_int_s32(int key, unsigned int seed)
cpdef uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed)
cpdef int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed)
@@ -0,0 +1,136 @@
"""Cython wrapper for MurmurHash3 non-cryptographic hash function.
MurmurHash is an extensively tested and very fast hash function that has
good distribution properties suitable for machine learning use cases
such as feature hashing and random projections.
The original C++ code by Austin Appleby is released in the public domain
and can be found here:
https://code.google.com/p/smhasher/
"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause
from ..utils._typedefs cimport int32_t, uint32_t
import numpy as np
cdef extern from "src/MurmurHash3.h":
void MurmurHash3_x86_32(void *key, int len, uint32_t seed, void *out)
void MurmurHash3_x86_128(void *key, int len, uint32_t seed, void *out)
void MurmurHash3_x64_128 (void *key, int len, uint32_t seed, void *out)
cpdef uint32_t murmurhash3_int_u32(int key, unsigned int seed):
"""Compute the 32bit murmurhash3 of a int key at seed."""
cdef uint32_t out
MurmurHash3_x86_32(&key, sizeof(int), seed, &out)
return out
cpdef int32_t murmurhash3_int_s32(int key, unsigned int seed):
"""Compute the 32bit murmurhash3 of a int key at seed."""
cdef int32_t out
MurmurHash3_x86_32(&key, sizeof(int), seed, &out)
return out
cpdef uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed):
"""Compute the 32bit murmurhash3 of a bytes key at seed."""
cdef uint32_t out
MurmurHash3_x86_32(<char*> key, len(key), seed, &out)
return out
cpdef int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed):
"""Compute the 32bit murmurhash3 of a bytes key at seed."""
cdef int32_t out
MurmurHash3_x86_32(<char*> key, len(key), seed, &out)
return out
def _murmurhash3_bytes_array_u32(
const int32_t[:] key,
unsigned int seed,
):
"""Compute 32bit murmurhash3 hashes of a key int array at seed."""
# TODO make it possible to pass preallocated output array
cdef:
uint32_t[:] out = np.zeros(key.size, np.uint32)
Py_ssize_t i
for i in range(key.shape[0]):
out[i] = murmurhash3_int_u32(key[i], seed)
return np.asarray(out)
def _murmurhash3_bytes_array_s32(
const int32_t[:] key,
unsigned int seed,
):
"""Compute 32bit murmurhash3 hashes of a key int array at seed."""
# TODO make it possible to pass preallocated output array
cdef:
int32_t[:] out = np.zeros(key.size, np.int32)
Py_ssize_t i
for i in range(key.shape[0]):
out[i] = murmurhash3_int_s32(key[i], seed)
return np.asarray(out)
def murmurhash3_32(key, seed=0, positive=False):
"""Compute the 32bit murmurhash3 of key at seed.
The underlying implementation is MurmurHash3_x86_32, generating a
low-latency 32-bit hash suitable for implementing lookup tables, Bloom
filters, count min sketch or feature hashing.
Parameters
----------
key : np.int32, bytes, unicode or ndarray of dtype=np.int32
The physical object to hash.
seed : int, default=0
Integer seed for the hashing algorithm.
positive : bool, default=False
True: the result is cast to an unsigned int
from 0 to 2 ** 32 - 1
False: the result is cast to a signed int
from -(2 ** 31) to 2 ** 31 - 1
Examples
--------
>>> from sklearn.utils import murmurhash3_32
>>> murmurhash3_32(b"Hello World!", seed=42)
3565178
"""
if isinstance(key, bytes):
if positive:
return murmurhash3_bytes_u32(key, seed)
else:
return murmurhash3_bytes_s32(key, seed)
elif isinstance(key, unicode):
if positive:
return murmurhash3_bytes_u32(key.encode('utf-8'), seed)
else:
return murmurhash3_bytes_s32(key.encode('utf-8'), seed)
elif isinstance(key, int) or isinstance(key, np.int32):
if positive:
return murmurhash3_int_u32(<int32_t>key, seed)
else:
return murmurhash3_int_s32(<int32_t>key, seed)
elif isinstance(key, np.ndarray):
if key.dtype != np.int32:
raise TypeError(
"key.dtype should be int32, got %s" % key.dtype)
if positive:
return _murmurhash3_bytes_array_u32(key.ravel(), seed).reshape(key.shape)
else:
return _murmurhash3_bytes_array_s32(key.ravel(), seed).reshape(key.shape)
else:
raise TypeError(
"key %r with type %s is not supported. "
"Explicit conversion to bytes is required" % (key, type(key)))
@@ -0,0 +1,379 @@
"""
Our own implementation of the Newton algorithm
Unlike the scipy.optimize version, this version of the Newton conjugate
gradient solver uses only one function call to retrieve the
func value, the gradient value and a callable for the Hessian matvec
product. If the function call is very expensive (e.g. for logistic
regression with large design matrix), this approach gives very
significant speedups.
"""
# This is a modified file from scipy.optimize
# Original authors: Travis Oliphant, Eric Jones
# Modifications by Gael Varoquaux, Mathieu Blondel and Tom Dupre la Tour
# License: BSD
import warnings
import numpy as np
import scipy
from ..exceptions import ConvergenceWarning
from .fixes import line_search_wolfe1, line_search_wolfe2
class _LineSearchError(RuntimeError):
pass
def _line_search_wolfe12(
f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs
):
"""
Same as line_search_wolfe1, but fall back to line_search_wolfe2 if
suitable step length is not found, and raise an exception if a
suitable step length is not found.
Raises
------
_LineSearchError
If no suitable step size is found.
"""
is_verbose = verbose >= 2
eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps
if is_verbose:
print(" Line Search")
print(f" eps=16 * finfo.eps={eps}")
print(" try line search wolfe1")
ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)
if is_verbose:
_not_ = "not " if ret[0] is None else ""
print(" wolfe1 line search was " + _not_ + "successful")
if ret[0] is None:
# Have a look at the line_search method of our NewtonSolver class. We borrow
# the logic from there
# Deal with relative loss differences around machine precision.
args = kwargs.get("args", tuple())
fval = f(xk + pk, *args)
tiny_loss = np.abs(old_fval * eps)
loss_improvement = fval - old_fval
check = np.abs(loss_improvement) <= tiny_loss
if is_verbose:
print(
" check loss |improvement| <= eps * |loss_old|:"
f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
)
if check:
# 2.1 Check sum of absolute gradients as alternative condition.
sum_abs_grad_old = scipy.linalg.norm(gfk, ord=1)
grad = fprime(xk + pk, *args)
sum_abs_grad = scipy.linalg.norm(grad, ord=1)
check = sum_abs_grad < sum_abs_grad_old
if is_verbose:
print(
" check sum(|gradient|) < sum(|gradient_old|): "
f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
)
if check:
ret = (
1.0, # step size
ret[1] + 1, # number of function evaluations
ret[2] + 1, # number of gradient evaluations
fval,
old_fval,
grad,
)
if ret[0] is None:
# line search failed: try different one.
# TODO: It seems that the new check for the sum of absolute gradients above
# catches all cases that, earlier, ended up here. In fact, our tests never
# trigger this "if branch" here and we can consider to remove it.
if is_verbose:
print(" last resort: try line search wolfe2")
ret = line_search_wolfe2(
f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs
)
if is_verbose:
_not_ = "not " if ret[0] is None else ""
print(" wolfe2 line search was " + _not_ + "successful")
if ret[0] is None:
raise _LineSearchError()
return ret
def _cg(fhess_p, fgrad, maxiter, tol, verbose=0):
"""
Iteratively solve the linear system 'fhess_p . xsupi = -fgrad'
with the conjugate gradient method.
Parameters
----------
fhess_p : callable
Function that takes a vector as a parameter and returns the
matrix product of the Hessian and that vector.
fgrad : ndarray of shape (n_features,) or (n_features + 1,)
Gradient vector.
maxiter : int
Number of CG iterations.
tol : float
Stopping criterion.
Returns
-------
xsupi : ndarray of shape (n_features,) or (n_features + 1,)
Estimated solution.
"""
eps = 16 * np.finfo(np.float64).eps
xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)
ri = np.copy(fgrad)  # residual = fhess_p @ xsupi + fgrad (xsupi starts at 0)
psupi = -ri
i = 0
dri0 = np.dot(ri, ri)
# We also keep track of |p_i|^2.
psupi_norm2 = dri0
is_verbose = verbose >= 2
while i <= maxiter:
if np.sum(np.abs(ri)) <= tol:
if is_verbose:
print(
f" Inner CG solver iteration {i} stopped with\n"
f" sum(|residuals|) <= tol: {np.sum(np.abs(ri))} <= {tol}"
)
break
Ap = fhess_p(psupi)
# check curvature
curv = np.dot(psupi, Ap)
if 0 <= curv <= eps * psupi_norm2:
# See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient.
if is_verbose:
print(
f" Inner CG solver iteration {i} stopped with\n"
f" tiny_|p| = eps * ||p||^2, eps = {eps}, "
f"squred L2 norm ||p||^2 = {psupi_norm2}\n"
f" curvature <= tiny_|p|: {curv} <= {eps * psupi_norm2}"
)
break
elif curv < 0:
if i > 0:
if is_verbose:
print(
f" Inner CG solver iteration {i} stopped with negative "
f"curvature, curvature = {curv}"
)
break
else:
# fall back to steepest descent direction
xsupi += dri0 / curv * psupi
if is_verbose:
print(" Inner CG solver iteration 0 fell back to steepest descent")
break
alphai = dri0 / curv
xsupi += alphai * psupi
ri += alphai * Ap
dri1 = np.dot(ri, ri)
betai = dri1 / dri0
psupi = -ri + betai * psupi
# We use |p_i|^2 = |r_i|^2 + beta_i^2 |p_{i-1}|^2
psupi_norm2 = dri1 + betai**2 * psupi_norm2
i = i + 1
dri0 = dri1 # update np.dot(ri,ri) for next time.
if is_verbose and i > maxiter:
print(
f" Inner CG solver stopped reaching maxiter={i - 1} with "
f"sum(|residuals|) = {np.sum(np.abs(ri))}"
)
return xsupi
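# Illustrative sketch on a tiny SPD system (ad hoc values): the returned vector
# approximately solves fhess_p(x) = -fgrad, i.e. it is a Newton-style step.
# >>> A = np.array([[3.0, 1.0], [1.0, 2.0]])
# >>> b = np.array([1.0, 1.0])
# >>> x = _cg(lambda p: A @ p, b, maxiter=10, tol=1e-12)
# >>> np.allclose(A @ x, -b)
# True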
def _newton_cg(
grad_hess,
func,
grad,
x0,
args=(),
tol=1e-4,
maxiter=100,
maxinner=200,
line_search=True,
warn=True,
verbose=0,
):
"""
Minimization of scalar function of one or more variables using the
Newton-CG algorithm.
Parameters
----------
grad_hess : callable
Should return the gradient and a callable returning the matvec product
of the Hessian.
func : callable
Should return the value of the function.
grad : callable
Should return the gradient. This is used by the line search
functions.
x0 : array of float
Initial guess.
args : tuple, default=()
Arguments passed to func_grad_hess, func and grad.
tol : float, default=1e-4
Stopping criterion. The iteration will stop when
``max{|g_i|, i = 1, ..., n} <= tol``
where ``g_i`` is the i-th component of the gradient.
maxiter : int, default=100
Number of Newton iterations.
maxinner : int, default=200
Number of CG iterations.
line_search : bool, default=True
Whether to use a line search or not.
warn : bool, default=True
Whether to warn when the algorithm did not converge.
Returns
-------
xk : ndarray of float
Estimated minimum.
"""
x0 = np.asarray(x0).flatten()
xk = np.copy(x0)
k = 0
if line_search:
old_fval = func(x0, *args)
old_old_fval = None
else:
old_fval = 0
is_verbose = verbose > 0
# Outer loop: our Newton iteration
while k < maxiter:
# Compute a search direction pk by applying the CG method to
# del2 f(xk) p = - fgrad f(xk) starting from 0.
fgrad, fhess_p = grad_hess(xk, *args)
absgrad = np.abs(fgrad)
max_absgrad = np.max(absgrad)
check = max_absgrad <= tol
if is_verbose:
print(f"Newton-CG iter = {k}")
print(" Check Convergence")
print(f" max |gradient| <= tol: {max_absgrad} <= {tol} {check}")
if check:
break
maggrad = np.sum(absgrad)
eta = min([0.5, np.sqrt(maggrad)])
termcond = eta * maggrad
# Inner loop: solve the Newton update by conjugate gradient, to
# avoid inverting the Hessian
xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose)
alphak = 1.0
if line_search:
try:
alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12(
func,
grad,
xk,
xsupi,
fgrad,
old_fval,
old_old_fval,
verbose=verbose,
args=args,
)
except _LineSearchError:
warnings.warn("Line Search failed")
break
xk += alphak * xsupi # upcast if necessary
k += 1
if warn and k >= maxiter:
warnings.warn(
(
f"newton-cg failed to converge at loss = {old_fval}. Increase the"
" number of iterations."
),
ConvergenceWarning,
)
elif is_verbose:
print(f" Solver did converge at loss = {old_fval}.")
return xk, k
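# Illustrative sketch minimizing a strongly convex quadratic (ad hoc values);
# `grad_hess` returns the gradient and a Hessian-vector product callable.
# >>> A = np.array([[3.0, 1.0], [1.0, 2.0]])
# >>> target = np.array([1.0, -2.0])
# >>> func = lambda x: 0.5 * (x - target) @ A @ (x - target)
# >>> grad = lambda x: A @ (x - target)
# >>> grad_hess = lambda x: (A @ (x - target), lambda p: A @ p)
# >>> xmin, n_iter = _newton_cg(grad_hess, func, grad, x0=np.zeros(2))
# >>> np.allclose(xmin, target, atol=1e-3)
# True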
def _check_optimize_result(solver, result, max_iter=None, extra_warning_msg=None):
"""Check the OptimizeResult for successful convergence
Parameters
----------
solver : str
Solver name. Currently only `lbfgs` is supported.
result : OptimizeResult
Result of the scipy.optimize.minimize function.
max_iter : int, default=None
Expected maximum number of iterations.
extra_warning_msg : str, default=None
Extra warning message.
Returns
-------
n_iter : int
Number of iterations.
"""
# handle both scipy and scikit-learn solver names
if solver == "lbfgs":
if result.status != 0:
try:
# The message is already decoded in scipy>=1.6.0
result_message = result.message.decode("latin1")
except AttributeError:
result_message = result.message
warning_msg = (
"{} failed to converge (status={}):\n{}.\n\n"
"Increase the number of iterations (max_iter) "
"or scale the data as shown in:\n"
" https://scikit-learn.org/stable/modules/"
"preprocessing.html"
).format(solver, result.status, result_message)
if extra_warning_msg is not None:
warning_msg += "\n" + extra_warning_msg
warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2)
if max_iter is not None:
# In scipy <= 1.0.0, nit may exceed maxiter for lbfgs.
# See https://github.com/scipy/scipy/issues/7854
n_iter_i = min(result.nit, max_iter)
else:
n_iter_i = result.nit
else:
raise NotImplementedError
return n_iter_i
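# Illustrative sketch with scipy's L-BFGS-B on a trivial objective (ad hoc values);
# a clean result produces no ConvergenceWarning and the iteration count is returned.
# >>> from scipy.optimize import minimize
# >>> res = minimize(lambda w: float((w ** 2).sum()), x0=np.ones(3), method="L-BFGS-B")
# >>> n_iter = _check_optimize_result("lbfgs", res)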
@@ -0,0 +1,129 @@
"""
The :mod:`sklearn.utils.parallel` customizes `joblib` tools for scikit-learn usage.
"""
import functools
import warnings
from functools import update_wrapper
import joblib
from .._config import config_context, get_config
def _with_config(delayed_func, config):
"""Helper function that intends to attach a config to a delayed function."""
if hasattr(delayed_func, "with_config"):
return delayed_func.with_config(config)
else:
warnings.warn(
(
"`sklearn.utils.parallel.Parallel` needs to be used in "
"conjunction with `sklearn.utils.parallel.delayed` instead of "
"`joblib.delayed` to correctly propagate the scikit-learn "
"configuration to the joblib workers."
),
UserWarning,
)
return delayed_func
class Parallel(joblib.Parallel):
"""Tweak of :class:`joblib.Parallel` that propagates the scikit-learn configuration.
This subclass of :class:`joblib.Parallel` ensures that the active configuration
(thread-local) of scikit-learn is propagated to the parallel workers for the
duration of the execution of the parallel tasks.
The API does not change and you can refer to :class:`joblib.Parallel`
documentation for more details.
.. versionadded:: 1.3
"""
def __call__(self, iterable):
"""Dispatch the tasks and return the results.
Parameters
----------
iterable : iterable
Iterable containing tuples of (delayed_function, args, kwargs) that should
be consumed.
Returns
-------
results : list
List of results of the tasks.
"""
# Capture the thread-local scikit-learn configuration at the time
# Parallel.__call__ is issued since the tasks can be dispatched
# in a different thread depending on the backend and on the value of
# pre_dispatch and n_jobs.
config = get_config()
iterable_with_config = (
(_with_config(delayed_func, config), args, kwargs)
for delayed_func, args, kwargs in iterable
)
return super().__call__(iterable_with_config)
# remove when https://github.com/joblib/joblib/issues/1071 is fixed
def delayed(function):
"""Decorator used to capture the arguments of a function.
This alternative to `joblib.delayed` is meant to be used in conjunction
with `sklearn.utils.parallel.Parallel`. The latter captures the scikit-
learn configuration by calling `sklearn.get_config()` in the current
thread, prior to dispatching the first task. The captured configuration is
then propagated and enabled for the duration of the execution of the
delayed function in the joblib workers.
.. versionchanged:: 1.3
`delayed` was moved from `sklearn.utils.fixes` to `sklearn.utils.parallel`
in scikit-learn 1.3.
Parameters
----------
function : callable
The function to be delayed.
Returns
-------
output : tuple
Tuple containing the delayed function, the positional arguments, and the
keyword arguments.
"""
@functools.wraps(function)
def delayed_function(*args, **kwargs):
return _FuncWrapper(function), args, kwargs
return delayed_function
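# Illustrative usage sketch (ad hoc values): pairing `delayed` with this module's
# `Parallel` so that the active scikit-learn configuration reaches the workers.
# >>> from sklearn import config_context
# >>> with config_context(assume_finite=True):
# ...     results = Parallel(n_jobs=2)(delayed(abs)(i) for i in [-1, -2, -3])
# >>> results
# [1, 2, 3]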
class _FuncWrapper:
"""Load the global configuration before calling the function."""
def __init__(self, function):
self.function = function
update_wrapper(self, self.function)
def with_config(self, config):
self.config = config
return self
def __call__(self, *args, **kwargs):
config = getattr(self, "config", None)
if config is None:
warnings.warn(
(
"`sklearn.utils.parallel.delayed` should be used with"
" `sklearn.utils.parallel.Parallel` to make it possible to"
" propagate the scikit-learn configuration of the current thread to"
" the joblib workers."
),
UserWarning,
)
config = {}
with config_context(**config):
return self.function(*args, **kwargs)
@@ -0,0 +1,103 @@
"""
The :mod:`sklearn.utils.random` module includes utilities for random sampling.
"""
# Author: Hamzeh Alsalhi <ha258@cornell.edu>
#
# License: BSD 3 clause
import array
import numpy as np
import scipy.sparse as sp
from . import check_random_state
from ._random import sample_without_replacement
__all__ = ["sample_without_replacement"]
def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
"""Generate a sparse random matrix given column class distributions
Parameters
----------
n_samples : int,
Number of samples to draw in each column.
classes : list of size n_outputs of arrays of size (n_classes,)
List of classes for each column.
class_probability : list of size n_outputs of arrays of \
shape (n_classes,), default=None
Class distribution of each column. If None, uniform distribution is
assumed.
random_state : int, RandomState instance or None, default=None
Controls the randomness of the sampled classes.
See :term:`Glossary <random_state>`.
Returns
-------
random_matrix : sparse csc matrix of size (n_samples, n_outputs)
"""
data = array.array("i")
indices = array.array("i")
indptr = array.array("i", [0])
for j in range(len(classes)):
classes[j] = np.asarray(classes[j])
if classes[j].dtype.kind != "i":
raise ValueError("class dtype %s is not supported" % classes[j].dtype)
classes[j] = classes[j].astype(np.int64, copy=False)
# use uniform distribution if no class_probability is given
if class_probability is None:
class_prob_j = np.empty(shape=classes[j].shape[0])
class_prob_j.fill(1 / classes[j].shape[0])
else:
class_prob_j = np.asarray(class_probability[j])
if not np.isclose(np.sum(class_prob_j), 1.0):
raise ValueError(
"Probability array at index {0} does not sum to one".format(j)
)
if class_prob_j.shape[0] != classes[j].shape[0]:
raise ValueError(
"classes[{0}] (length {1}) and "
"class_probability[{0}] (length {2}) have "
"different length.".format(
j, classes[j].shape[0], class_prob_j.shape[0]
)
)
# If 0 is not present in the classes insert it with a probability 0.0
if 0 not in classes[j]:
classes[j] = np.insert(classes[j], 0, 0)
class_prob_j = np.insert(class_prob_j, 0, 0.0)
# If there are nonzero classes choose randomly using class_probability
rng = check_random_state(random_state)
if classes[j].shape[0] > 1:
index_class_0 = np.flatnonzero(classes[j] == 0).item()
p_nonzero = 1 - class_prob_j[index_class_0]
nnz = int(n_samples * p_nonzero)
ind_sample = sample_without_replacement(
n_population=n_samples, n_samples=nnz, random_state=random_state
)
indices.extend(ind_sample)
# Normalize probabilities for the nonzero elements
classes_j_nonzero = classes[j] != 0
class_probability_nz = class_prob_j[classes_j_nonzero]
class_probability_nz_norm = class_probability_nz / np.sum(
class_probability_nz
)
classes_ind = np.searchsorted(
class_probability_nz_norm.cumsum(), rng.uniform(size=nnz)
)
data.extend(classes[j][classes_j_nonzero][classes_ind])
indptr.append(len(indices))
return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)
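# Illustrative sketch drawing one binary output column (ad hoc values):
# >>> mat = _random_choice_csc(
# ...     n_samples=5,
# ...     classes=[np.array([0, 1])],
# ...     class_probability=[np.array([0.5, 0.5])],
# ...     random_state=0,
# ... )
# >>> mat.shape
# (5, 1)
# >>> mat.nnz
# 2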
@@ -0,0 +1,745 @@
"""
The :mod:`sklearn.utils.sparsefuncs` module includes a collection of utilities to
work with sparse matrices and arrays.
"""
# Authors: Manoj Kumar
# Thomas Unterthiner
# Giorgio Patrini
#
# License: BSD 3 clause
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import LinearOperator
from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max
from ..utils.validation import _check_sample_weight
from .sparsefuncs_fast import (
csc_mean_variance_axis0 as _csc_mean_var_axis0,
)
from .sparsefuncs_fast import (
csr_mean_variance_axis0 as _csr_mean_var_axis0,
)
from .sparsefuncs_fast import (
incr_mean_variance_axis0 as _incr_mean_var_axis0,
)
def _raise_typeerror(X):
"""Raises a TypeError if X is not a CSR or CSC matrix"""
input_type = X.format if sp.issparse(X) else type(X)
err = "Expected a CSR or CSC sparse matrix, got %s." % input_type
raise TypeError(err)
def _raise_error_wrong_axis(axis):
if axis not in (0, 1):
raise ValueError(
"Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis
)
def inplace_csr_column_scale(X, scale):
"""Inplace column scaling of a CSR matrix.
Scale each feature of the data matrix by multiplying with specific scale
provided by the caller assuming a (n_samples, n_features) shape.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix to normalize using the variance of the features.
It should be of CSR format.
scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
Array of precomputed feature-wise values to use for scaling.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 3, 4, 4, 4])
>>> indices = np.array([0, 1, 2, 2])
>>> data = np.array([8, 1, 2, 5])
>>> scale = np.array([2, 3, 2])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 1, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
>>> sparsefuncs.inplace_csr_column_scale(csr, scale)
>>> csr.todense()
matrix([[16, 3, 4],
[ 0, 0, 10],
[ 0, 0, 0],
[ 0, 0, 0]])
"""
assert scale.shape[0] == X.shape[1]
X.data *= scale.take(X.indices, mode="clip")
def inplace_csr_row_scale(X, scale):
"""Inplace row scaling of a CSR matrix.
Scale each sample of the data matrix by multiplying with specific scale
provided by the caller assuming a (n_samples, n_features) shape.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix to be scaled. It should be of CSR format.
scale : ndarray of float of shape (n_samples,)
Array of precomputed sample-wise values to use for scaling.
"""
assert scale.shape[0] == X.shape[0]
X.data *= np.repeat(scale, np.diff(X.indptr))
def mean_variance_axis(X, axis, weights=None, return_sum_weights=False):
"""Compute mean and variance along an axis on a CSR or CSC matrix.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Input data. It can be of CSR or CSC format.
axis : {0, 1}
Axis along which the mean and variance should be computed.
weights : ndarray of shape (n_samples,) or (n_features,), default=None
If axis is set to 0 shape is (n_samples,) or
if axis is set to 1 shape is (n_features,).
If it is set to None, then samples are equally weighted.
.. versionadded:: 0.24
return_sum_weights : bool, default=False
If True, returns the sum of weights seen for each feature
if `axis=0` or each sample if `axis=1`.
.. versionadded:: 0.24
Returns
-------
means : ndarray of shape (n_features,), dtype=floating
Feature-wise means.
variances : ndarray of shape (n_features,), dtype=floating
Feature-wise variances.
sum_weights : ndarray of shape (n_features,), dtype=floating
Returned if `return_sum_weights` is `True`.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 3, 4, 4, 4])
>>> indices = np.array([0, 1, 2, 2])
>>> data = np.array([8, 1, 2, 5])
>>> scale = np.array([2, 3, 2])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 1, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
>>> sparsefuncs.mean_variance_axis(csr, axis=0)
(array([2. , 0.25, 1.75]), array([12. , 0.1875, 4.1875]))
"""
_raise_error_wrong_axis(axis)
if sp.issparse(X) and X.format == "csr":
if axis == 0:
return _csr_mean_var_axis0(
X, weights=weights, return_sum_weights=return_sum_weights
)
else:
return _csc_mean_var_axis0(
X.T, weights=weights, return_sum_weights=return_sum_weights
)
elif sp.issparse(X) and X.format == "csc":
if axis == 0:
return _csc_mean_var_axis0(
X, weights=weights, return_sum_weights=return_sum_weights
)
else:
return _csr_mean_var_axis0(
X.T, weights=weights, return_sum_weights=return_sum_weights
)
else:
_raise_typeerror(X)
def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):
"""Compute incremental mean and variance along an axis on a CSR or CSC matrix.
last_mean, last_var are the statistics computed at the last step by this
function. Both must be initialized to 0-arrays of the proper size, i.e.
the number of features in X. last_n is the number of samples encountered
until now.
Parameters
----------
X : CSR or CSC sparse matrix of shape (n_samples, n_features)
Input data.
axis : {0, 1}
Axis along which the mean and variance should be computed.
last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating
Array of means to update with the new data X.
Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating
Array of variances to update with the new data X.
Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
last_n : float or ndarray of shape (n_features,) or (n_samples,), \
dtype=floating
Sum of the weights seen so far, excluding the current weights
If not float, it should be of shape (n_features,) if
axis=0 or (n_samples,) if axis=1. If float it corresponds to
having same weights for all samples (or features).
weights : ndarray of shape (n_samples,) or (n_features,), default=None
If axis is set to 0 shape is (n_samples,) or
if axis is set to 1 shape is (n_features,).
If it is set to None, then samples are equally weighted.
.. versionadded:: 0.24
Returns
-------
means : ndarray of shape (n_features,) or (n_samples,), dtype=floating
Updated feature-wise means if axis = 0 or
sample-wise means if axis = 1.
variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating
Updated feature-wise variances if axis = 0 or
sample-wise variances if axis = 1.
n : ndarray of shape (n_features,) or (n_samples,), dtype=integral
Updated number of seen samples per feature if axis=0
or number of seen features per sample if axis=1.
If weights is not None, n is a sum of the weights of the seen
samples or features instead of the actual number of seen
samples or features.
Notes
-----
NaNs are ignored in the algorithm.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 3, 4, 4, 4])
>>> indices = np.array([0, 1, 2, 2])
>>> data = np.array([8, 1, 2, 5])
>>> scale = np.array([2, 3, 2])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 1, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
>>> sparsefuncs.incr_mean_variance_axis(
... csr, axis=0, last_mean=np.zeros(3), last_var=np.zeros(3), last_n=2
... )
(array([1.3..., 0.1..., 1.1...]), array([8.8..., 0.1..., 3.4...]),
array([6., 6., 6.]))
"""
_raise_error_wrong_axis(axis)
if not (sp.issparse(X) and X.format in ("csc", "csr")):
_raise_typeerror(X)
if np.size(last_n) == 1:
last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)
if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)):
raise ValueError("last_mean, last_var, last_n do not have the same shapes.")
if axis == 1:
if np.size(last_mean) != X.shape[0]:
raise ValueError(
"If axis=1, then last_mean, last_n, last_var should be of "
f"size n_samples {X.shape[0]} (Got {np.size(last_mean)})."
)
else: # axis == 0
if np.size(last_mean) != X.shape[1]:
raise ValueError(
"If axis=0, then last_mean, last_n, last_var should be of "
f"size n_features {X.shape[1]} (Got {np.size(last_mean)})."
)
X = X.T if axis == 1 else X
if weights is not None:
weights = _check_sample_weight(weights, X, dtype=X.dtype)
return _incr_mean_var_axis0(
X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights
)
def inplace_column_scale(X, scale):
"""Inplace column scaling of a CSC/CSR matrix.
Scale each feature of the data matrix by multiplying with specific scale
provided by the caller assuming a (n_samples, n_features) shape.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix to normalize using the variance of the features. It should be
of CSC or CSR format.
scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
Array of precomputed feature-wise values to use for scaling.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 3, 4, 4, 4])
>>> indices = np.array([0, 1, 2, 2])
>>> data = np.array([8, 1, 2, 5])
>>> scale = np.array([2, 3, 2])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 1, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
>>> sparsefuncs.inplace_column_scale(csr, scale)
>>> csr.todense()
matrix([[16, 3, 4],
[ 0, 0, 10],
[ 0, 0, 0],
[ 0, 0, 0]])
"""
if sp.issparse(X) and X.format == "csc":
inplace_csr_row_scale(X.T, scale)
elif sp.issparse(X) and X.format == "csr":
inplace_csr_column_scale(X, scale)
else:
_raise_typeerror(X)
def inplace_row_scale(X, scale):
"""Inplace row scaling of a CSR or CSC matrix.
Scale each row of the data matrix by multiplying with specific scale
provided by the caller assuming a (n_samples, n_features) shape.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix to be scaled. It should be of CSR or CSC format.
scale : ndarray of shape (n_samples,), dtype={np.float32, np.float64}
Array of precomputed sample-wise values to use for scaling.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 2, 3, 4, 5])
>>> indices = np.array([0, 1, 2, 3, 3])
>>> data = np.array([8, 1, 2, 5, 6])
>>> scale = np.array([2, 3, 4, 5])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 1, 0, 0],
[0, 0, 2, 0],
[0, 0, 0, 5],
[0, 0, 0, 6]])
>>> sparsefuncs.inplace_row_scale(csr, scale)
>>> csr.todense()
matrix([[16, 2, 0, 0],
[ 0, 0, 6, 0],
[ 0, 0, 0, 20],
[ 0, 0, 0, 30]])
"""
if sp.issparse(X) and X.format == "csc":
inplace_csr_column_scale(X.T, scale)
elif sp.issparse(X) and X.format == "csr":
inplace_csr_row_scale(X, scale)
else:
_raise_typeerror(X)
def inplace_swap_row_csc(X, m, n):
"""Swap two rows of a CSC matrix in-place.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix whose two rows are to be swapped. It should be of
CSC format.
m : int
Index of the row of X to be swapped.
n : int
Index of the row of X to be swapped.
"""
for t in [m, n]:
if isinstance(t, np.ndarray):
raise TypeError("m and n should be valid integers")
if m < 0:
m += X.shape[0]
if n < 0:
n += X.shape[0]
m_mask = X.indices == m
X.indices[X.indices == n] = m
X.indices[m_mask] = n
def inplace_swap_row_csr(X, m, n):
"""Swap two rows of a CSR matrix in-place.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix whose two rows are to be swapped. It should be of
CSR format.
m : int
Index of the row of X to be swapped.
n : int
Index of the row of X to be swapped.
"""
for t in [m, n]:
if isinstance(t, np.ndarray):
raise TypeError("m and n should be valid integers")
if m < 0:
m += X.shape[0]
if n < 0:
n += X.shape[0]
# The following swapping makes life easier since m is assumed to be the
# smaller integer below.
if m > n:
m, n = n, m
indptr = X.indptr
m_start = indptr[m]
m_stop = indptr[m + 1]
n_start = indptr[n]
n_stop = indptr[n + 1]
nz_m = m_stop - m_start
nz_n = n_stop - n_start
if nz_m != nz_n:
# Modify indptr first
X.indptr[m + 2 : n] += nz_n - nz_m
X.indptr[m + 1] = m_start + nz_n
X.indptr[n] = n_stop - nz_m
X.indices = np.concatenate(
[
X.indices[:m_start],
X.indices[n_start:n_stop],
X.indices[m_stop:n_start],
X.indices[m_start:m_stop],
X.indices[n_stop:],
]
)
X.data = np.concatenate(
[
X.data[:m_start],
X.data[n_start:n_stop],
X.data[m_stop:n_start],
X.data[m_start:m_stop],
X.data[n_stop:],
]
)
def inplace_swap_row(X, m, n):
"""
Swap two rows of a CSC/CSR matrix in-place.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix whose two rows are to be swapped. It should be of CSR or
CSC format.
m : int
Index of the row of X to be swapped.
n : int
Index of the row of X to be swapped.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 2, 3, 3, 3])
>>> indices = np.array([0, 2, 2])
>>> data = np.array([8, 2, 5])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 0, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
>>> sparsefuncs.inplace_swap_row(csr, 0, 1)
>>> csr.todense()
matrix([[0, 0, 5],
[8, 0, 2],
[0, 0, 0],
[0, 0, 0]])
"""
if sp.issparse(X) and X.format == "csc":
inplace_swap_row_csc(X, m, n)
elif sp.issparse(X) and X.format == "csr":
inplace_swap_row_csr(X, m, n)
else:
_raise_typeerror(X)
def inplace_swap_column(X, m, n):
"""
Swap two columns of a CSC/CSR matrix in-place.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Matrix whose two columns are to be swapped. It should be of
CSR or CSC format.
m : int
Index of the column of X to be swapped.
n : int
Index of the column of X to be swapped.
Examples
--------
>>> from sklearn.utils import sparsefuncs
>>> from scipy import sparse
>>> import numpy as np
>>> indptr = np.array([0, 2, 3, 3, 3])
>>> indices = np.array([0, 2, 2])
>>> data = np.array([8, 2, 5])
>>> csr = sparse.csr_matrix((data, indices, indptr))
>>> csr.todense()
matrix([[8, 0, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
>>> sparsefuncs.inplace_swap_column(csr, 0, 1)
>>> csr.todense()
matrix([[0, 8, 2],
[0, 0, 5],
[0, 0, 0],
[0, 0, 0]])
"""
if m < 0:
m += X.shape[1]
if n < 0:
n += X.shape[1]
if sp.issparse(X) and X.format == "csc":
inplace_swap_row_csr(X, m, n)
elif sp.issparse(X) and X.format == "csr":
inplace_swap_row_csc(X, m, n)
else:
_raise_typeerror(X)
def min_max_axis(X, axis, ignore_nan=False):
"""Compute minimum and maximum along an axis on a CSR or CSC matrix.
Optionally ignore NaN values.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Input data. It should be of CSR or CSC format.
axis : {0, 1}
Axis along which the minima and maxima should be computed.
ignore_nan : bool, default=False
Ignore or pass through NaN values.
.. versionadded:: 0.20
Returns
-------
mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}
Feature-wise minima.
maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}
Feature-wise maxima.
"""
if sp.issparse(X) and X.format in ("csr", "csc"):
if ignore_nan:
return _sparse_nan_min_max(X, axis=axis)
else:
return _sparse_min_max(X, axis=axis)
else:
_raise_typeerror(X)
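# Illustrative sketch (ad hoc values); implicit zeros participate in the result:
# >>> X = sp.csr_matrix(np.array([[0.0, 2.0], [-1.0, 0.0]]))
# >>> mins, maxs = min_max_axis(X, axis=0)
# >>> mins
# array([-1.,  0.])
# >>> maxs
# array([0., 2.])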
def count_nonzero(X, axis=None, sample_weight=None):
"""A variant of X.getnnz() with extension to weighting on axis 0.
Useful in efficiently calculating multilabel metrics.
Parameters
----------
X : sparse matrix of shape (n_samples, n_labels)
Input data. It should be of CSR format.
axis : {0, 1}, default=None
The axis on which the data is aggregated.
sample_weight : array-like of shape (n_samples,), default=None
Weight for each row of X.
Returns
-------
nnz : int, float, ndarray of shape (n_samples,) or ndarray of shape (n_features,)
Number of non-zero values in the array along a given axis. Otherwise,
the total number of non-zero values in the array is returned.
"""
if axis == -1:
axis = 1
elif axis == -2:
axis = 0
elif X.format != "csr":
raise TypeError("Expected CSR sparse format, got {0}".format(X.format))
# We rely here on the fact that np.diff(X.indptr) for a CSR
# will return the number of nonzero entries in each row.
# A bincount over X.indices will return the number of nonzeros
# in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14.
if axis is None:
if sample_weight is None:
return X.nnz
else:
return np.dot(np.diff(X.indptr), sample_weight)
elif axis == 1:
out = np.diff(X.indptr)
if sample_weight is None:
# astype here is for consistency with axis=0 dtype
return out.astype("intp")
return out * sample_weight
elif axis == 0:
if sample_weight is None:
return np.bincount(X.indices, minlength=X.shape[1])
else:
weights = np.repeat(sample_weight, np.diff(X.indptr))
return np.bincount(X.indices, minlength=X.shape[1], weights=weights)
else:
raise ValueError("Unsupported axis: {0}".format(axis))
def _get_median(data, n_zeros):
"""Compute the median of data with n_zeros additional zeros.
This function is used to support sparse matrices; it modifies data
in-place.
"""
n_elems = len(data) + n_zeros
if not n_elems:
return np.nan
n_negative = np.count_nonzero(data < 0)
middle, is_odd = divmod(n_elems, 2)
data.sort()
if is_odd:
return _get_elem_at_rank(middle, data, n_negative, n_zeros)
return (
_get_elem_at_rank(middle - 1, data, n_negative, n_zeros)
+ _get_elem_at_rank(middle, data, n_negative, n_zeros)
) / 2.0
def _get_elem_at_rank(rank, data, n_negative, n_zeros):
"""Find the value in data augmented with n_zeros for the given rank"""
if rank < n_negative:
return data[rank]
if rank - n_negative < n_zeros:
return 0
return data[rank - n_zeros]
def csc_median_axis_0(X):
"""Find the median across axis 0 of a CSC matrix.
It is equivalent to doing np.median(X, axis=0).
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
Input data. It should be of CSC format.
Returns
-------
median : ndarray of shape (n_features,)
Median.
"""
if not (sp.issparse(X) and X.format == "csc"):
raise TypeError("Expected matrix of CSC format, got %s" % X.format)
indptr = X.indptr
n_samples, n_features = X.shape
median = np.zeros(n_features)
for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
# Prevent modifying X in place
data = np.copy(X.data[start:end])
nz = n_samples - data.size
median[f_ind] = _get_median(data, nz)
return median
def _implicit_column_offset(X, offset):
"""Create an implicitly offset linear operator.
This is used by PCA on sparse data to avoid densifying the whole data
matrix.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features)
offset : ndarray of shape (n_features,)
Returns
-------
centered : LinearOperator
"""
offset = offset[None, :]
XT = X.T
return LinearOperator(
matvec=lambda x: X @ x - offset @ x,
matmat=lambda x: X @ x - offset @ x,
rmatvec=lambda x: XT @ x - (offset * x.sum()),
rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :],
dtype=X.dtype,
shape=X.shape,
)
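# Illustrative sketch (ad hoc values): the operator behaves like the densified,
# column-centered matrix without ever materializing it.
# >>> X = sp.csr_matrix(np.array([[1.0, 2.0], [3.0, 4.0]]))
# >>> offset = np.asarray(X.mean(axis=0)).ravel()
# >>> op = _implicit_column_offset(X, offset)
# >>> np.allclose(op.matvec(np.ones(2)), (X.toarray() - offset) @ np.ones(2))
# True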
@@ -0,0 +1,640 @@
"""
The :mod:`sklearn.utils.sparsefuncs_fast` module includes a collection of utilities to
work with sparse matrices and arrays written in Cython.
"""
# Authors: Mathieu Blondel
# Olivier Grisel
# Peter Prettenhofer
# Lars Buitinck
# Giorgio Patrini
#
# License: BSD 3 clause
from libc.math cimport fabs, sqrt, isnan
from libc.stdint cimport intptr_t
import numpy as np
from cython cimport floating
from ..utils._typedefs cimport float64_t, int32_t, int64_t, intp_t, uint64_t
ctypedef fused integral:
int32_t
int64_t
def csr_row_norms(X):
"""Squared L2 norm of each row in CSR matrix X."""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
return _sqeuclidean_row_norms_sparse(X.data, X.indptr)
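# Illustrative sketch (ad hoc values):
# >>> from scipy import sparse
# >>> X = sparse.csr_matrix(np.array([[3.0, 4.0], [0.0, 1.0]]))
# >>> csr_row_norms(X)
# array([25.,  1.])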
def _sqeuclidean_row_norms_sparse(
const floating[::1] X_data,
const integral[::1] X_indptr,
):
cdef:
integral n_samples = X_indptr.shape[0] - 1
integral i, j
dtype = np.float32 if floating is float else np.float64
cdef floating[::1] squared_row_norms = np.zeros(n_samples, dtype=dtype)
with nogil:
for i in range(n_samples):
for j in range(X_indptr[i], X_indptr[i + 1]):
squared_row_norms[i] += X_data[j] * X_data[j]
return np.asarray(squared_row_norms)
def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):
"""Compute mean and variance along axis 0 on a CSR matrix
Uses a np.float64 accumulator.
Parameters
----------
X : CSR sparse matrix, shape (n_samples, n_features)
Input data.
weights : ndarray of shape (n_samples,), dtype=floating, default=None
If it is set to None samples will be equally weighted.
.. versionadded:: 0.24
return_sum_weights : bool, default=False
If True, returns the sum of weights seen for each feature.
.. versionadded:: 0.24
Returns
-------
means : float array with shape (n_features,)
Feature-wise means
variances : float array with shape (n_features,)
Feature-wise variances
sum_weights : ndarray of shape (n_features,), dtype=floating
Returned if return_sum_weights is True.
"""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
if weights is None:
weights = np.ones(X.shape[0], dtype=X.dtype)
means, variances, sum_weights = _csr_mean_variance_axis0(
X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)
if return_sum_weights:
return means, variances, sum_weights
return means, variances
def _csr_mean_variance_axis0(
const floating[::1] X_data,
uint64_t n_samples,
uint64_t n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
intp_t row_ind
uint64_t feature_idx
integral i, col_ind
float64_t diff
# means[j] contains the mean of feature j
float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
float64_t[::1] variances = np.zeros(n_features)
float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
float64_t[::1] correction = np.zeros(shape=n_features)
uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)
for row_ind in range(len(X_indptr) - 1):
for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
col_ind = X_indices[i]
if not isnan(X_data[i]):
means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
# sum of weights where X[:, col_ind] is non-zero
sum_weights_nz[col_ind] += weights[row_ind]
# number of non-zero elements of X[:, col_ind]
counts_nz[col_ind] += 1
else:
# sum of weights where X[:, col_ind] is not nan
sum_weights[col_ind] -= weights[row_ind]
# number of non nan elements of X[:, col_ind]
counts[col_ind] -= 1
for feature_idx in range(n_features):
means[feature_idx] /= sum_weights[feature_idx]
for row_ind in range(len(X_indptr) - 1):
for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
col_ind = X_indices[i]
if not isnan(X_data[i]):
diff = X_data[i] - means[col_ind]
# correction term of the corrected 2 pass algorithm.
# See "Algorithms for computing the sample variance: analysis
# and recommendations", by Chan, Golub, and LeVeque.
correction[col_ind] += diff * weights[row_ind]
variances[col_ind] += diff * diff * weights[row_ind]
for feature_idx in range(n_features):
if counts[feature_idx] != counts_nz[feature_idx]:
correction[feature_idx] -= (
sum_weights[feature_idx] - sum_weights_nz[feature_idx]
) * means[feature_idx]
correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx]
if counts[feature_idx] != counts_nz[feature_idx]:
# only compute it when it's guaranteed to be non-zero to avoid
# catastrophic cancellation.
variances[feature_idx] += (
sum_weights[feature_idx] - sum_weights_nz[feature_idx]
) * means[feature_idx]**2
variances[feature_idx] = (
(variances[feature_idx] - correction[feature_idx]) /
sum_weights[feature_idx]
)
if floating is float:
return (
np.array(means, dtype=np.float32),
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32),
)
else:
return (
np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
)
def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
"""Compute mean and variance along axis 0 on a CSC matrix
Uses a np.float64 accumulator.
Parameters
----------
X : CSC sparse matrix, shape (n_samples, n_features)
Input data.
weights : ndarray of shape (n_samples,), dtype=floating, default=None
If set to None, samples will be equally weighted.
.. versionadded:: 0.24
return_sum_weights : bool, default=False
If True, returns the sum of weights seen for each feature.
.. versionadded:: 0.24
Returns
-------
means : float array with shape (n_features,)
Feature-wise means
variances : float array with shape (n_features,)
Feature-wise variances
sum_weights : ndarray of shape (n_features,), dtype=floating
Returned if return_sum_weights is True.
"""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
if weights is None:
weights = np.ones(X.shape[0], dtype=X.dtype)
means, variances, sum_weights = _csc_mean_variance_axis0(
X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)
if return_sum_weights:
return means, variances, sum_weights
return means, variances
def _csc_mean_variance_axis0(
const floating[::1] X_data,
uint64_t n_samples,
uint64_t n_features,
const integral[:] X_indices,
const integral[:] X_indptr,
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
integral i, row_ind
uint64_t feature_idx, col_ind
float64_t diff
# means[j] contains the mean of feature j
float64_t[::1] means = np.zeros(n_features)
# variances[j] contains the variance of feature j
float64_t[::1] variances = np.zeros(n_features)
float64_t[::1] sum_weights = np.full(
fill_value=np.sum(weights, dtype=np.float64), shape=n_features
)
float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
float64_t[::1] correction = np.zeros(shape=n_features)
uint64_t[::1] counts = np.full(
fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
)
uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)
for col_ind in range(n_features):
for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
row_ind = X_indices[i]
if not isnan(X_data[i]):
means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
# sum of weights where X[:, col_ind] is non-zero
sum_weights_nz[col_ind] += weights[row_ind]
# number of non-zero elements of X[:, col_ind]
counts_nz[col_ind] += 1
else:
# sum of weights where X[:, col_ind] is not nan
sum_weights[col_ind] -= weights[row_ind]
# number of non nan elements of X[:, col_ind]
counts[col_ind] -= 1
for feature_idx in range(n_features):
means[feature_idx] /= sum_weights[feature_idx]
for col_ind in range(n_features):
for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
row_ind = X_indices[i]
if not isnan(X_data[i]):
diff = X_data[i] - means[col_ind]
# correction term of the corrected 2 pass algorithm.
# See "Algorithms for computing the sample variance: analysis
# and recommendations", by Chan, Golub, and LeVeque.
correction[col_ind] += diff * weights[row_ind]
variances[col_ind] += diff * diff * weights[row_ind]
for feature_idx in range(n_features):
if counts[feature_idx] != counts_nz[feature_idx]:
correction[feature_idx] -= (
sum_weights[feature_idx] - sum_weights_nz[feature_idx]
) * means[feature_idx]
correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx]
if counts[feature_idx] != counts_nz[feature_idx]:
# only compute it when it's guaranteed to be non-zero to avoid
# catastrophic cancellation.
variances[feature_idx] += (
sum_weights[feature_idx] - sum_weights_nz[feature_idx]
) * means[feature_idx]**2
variances[feature_idx] = (
(variances[feature_idx] - correction[feature_idx])
) / sum_weights[feature_idx]
if floating is float:
return (np.array(means, dtype=np.float32),
np.array(variances, dtype=np.float32),
np.array(sum_weights, dtype=np.float32))
else:
return (
np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
)
def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
"""Compute mean and variance along axis 0 on a CSR or CSC matrix.
last_mean, last_var are the statistics computed at the last step by this
function. Both must be initialized to 0.0. last_n is the
number of samples encountered until now and is initialized at 0.
Parameters
----------
X : CSR or CSC sparse matrix, shape (n_samples, n_features)
Input data.
last_mean : float array with shape (n_features,)
Array of feature-wise means to update with the new data X.
last_var : float array with shape (n_features,)
Array of feature-wise var to update with the new data X.
last_n : float array with shape (n_features,)
Sum of the weights seen so far (if weights are all set to 1
this will be the same as number of samples seen so far, before X).
weights : float array with shape (n_samples,), default=None
If set to None, samples will be equally weighted.
Returns
-------
updated_mean : float array with shape (n_features,)
Feature-wise means
updated_variance : float array with shape (n_features,)
Feature-wise variances
updated_n : int array with shape (n_features,)
Updated number of samples seen
Notes
-----
NaNs are ignored during the computation.
References
----------
T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
variance: recommendations, The American Statistician, Vol. 37, No. 3,
pp. 242-247
Also, see the non-sparse implementation of this in
`utils.extmath._incremental_mean_and_var`.
"""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
X_dtype = X.dtype
if weights is None:
weights = np.ones(X.shape[0], dtype=X_dtype)
elif weights.dtype not in [np.float32, np.float64]:
weights = weights.astype(np.float64, copy=False)
if last_n.dtype not in [np.float32, np.float64]:
last_n = last_n.astype(np.float64, copy=False)
return _incr_mean_variance_axis0(X.data,
np.sum(weights),
X.shape[1],
X.indices,
X.indptr,
X.format,
last_mean.astype(X_dtype, copy=False),
last_var.astype(X_dtype, copy=False),
last_n.astype(X_dtype, copy=False),
weights.astype(X_dtype, copy=False))
def _incr_mean_variance_axis0(
const floating[:] X_data,
floating n_samples,
uint64_t n_features,
const int[:] X_indices,
# X_indptr might be either int32 or int64
const integral[:] X_indptr,
str X_format,
floating[:] last_mean,
floating[:] last_var,
floating[:] last_n,
# previous sum of the weights (ie float)
const floating[:] weights,
):
# Implement the function here since variables using fused types
# cannot be declared directly and can only be passed as function arguments
cdef:
uint64_t i
# last = stats until now
# new = the current increment
# updated = the aggregated stats
# when arrays, they are indexed by i per-feature
floating[::1] new_mean
floating[::1] new_var
floating[::1] updated_mean
floating[::1] updated_var
if floating is float:
dtype = np.float32
else:
dtype = np.float64
new_mean = np.zeros(n_features, dtype=dtype)
new_var = np.zeros_like(new_mean, dtype=dtype)
updated_mean = np.zeros_like(new_mean, dtype=dtype)
updated_var = np.zeros_like(new_mean, dtype=dtype)
cdef:
floating[::1] new_n
floating[::1] updated_n
floating[::1] last_over_new_n
# Obtain new stats first
updated_n = np.zeros(shape=n_features, dtype=dtype)
last_over_new_n = np.zeros_like(updated_n, dtype=dtype)
# X can be a CSR or CSC matrix
if X_format == 'csr':
new_mean, new_var, new_n = _csr_mean_variance_axis0(
X_data, n_samples, n_features, X_indices, X_indptr, weights)
else: # X_format == 'csc'
new_mean, new_var, new_n = _csc_mean_variance_axis0(
X_data, n_samples, n_features, X_indices, X_indptr, weights)
# First pass
cdef bint is_first_pass = True
for i in range(n_features):
if last_n[i] > 0:
is_first_pass = False
break
if is_first_pass:
return np.asarray(new_mean), np.asarray(new_var), np.asarray(new_n)
for i in range(n_features):
updated_n[i] = last_n[i] + new_n[i]
# Next passes
for i in range(n_features):
if new_n[i] > 0:
last_over_new_n[i] = dtype(last_n[i]) / dtype(new_n[i])
# Unnormalized stats
last_mean[i] *= last_n[i]
last_var[i] *= last_n[i]
new_mean[i] *= new_n[i]
new_var[i] *= new_n[i]
# Update stats
updated_var[i] = (
last_var[i] + new_var[i] +
last_over_new_n[i] / updated_n[i] *
(last_mean[i] / last_over_new_n[i] - new_mean[i])**2
)
updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n[i]
updated_var[i] /= updated_n[i]
else:
updated_var[i] = last_var[i]
updated_mean[i] = last_mean[i]
updated_n[i] = last_n[i]
return (
np.asarray(updated_mean),
np.asarray(updated_var),
np.asarray(updated_n),
)
def inplace_csr_row_normalize_l1(X):
"""Normalize inplace the rows of a CSR matrix or array by their L1 norm.
Parameters
----------
X : scipy.sparse.csr_matrix or scipy.sparse.csr_array, \
shape=(n_samples, n_features)
The input matrix or array to be modified inplace.
Examples
--------
>>> from scipy.sparse import csr_matrix
>>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
>>> X = csr_matrix(([1.0, 2.0, 3.0, 4.0], [0, 1, 2, 3], [0, 2, 3, 4]), shape=(3, 4))
>>> X.toarray()
array([[1., 2., 0., 0.],
[0., 0., 3., 0.],
[0., 0., 0., 4.]])
>>> inplace_csr_row_normalize_l1(X)
>>> X.toarray()
array([[0.33... , 0.66... , 0. , 0. ],
[0. , 0. , 1. , 0. ],
[0. , 0. , 0. , 1. ]])
"""
_inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr)
def _inplace_csr_row_normalize_l1(
floating[:] X_data,
shape,
const integral[:] X_indices,
const integral[:] X_indptr,
):
cdef:
uint64_t n_samples = shape[0]
# the column indices for row i are stored in:
# indices[indptr[i]:indices[i+1]]
# and their corresponding values are stored in:
# data[indptr[i]:indptr[i+1]]
uint64_t i
integral j
double sum_
for i in range(n_samples):
sum_ = 0.0
for j in range(X_indptr[i], X_indptr[i + 1]):
sum_ += fabs(X_data[j])
if sum_ == 0.0:
# do not normalize empty rows (can happen if CSR is not pruned
# correctly)
continue
for j in range(X_indptr[i], X_indptr[i + 1]):
X_data[j] /= sum_
def inplace_csr_row_normalize_l2(X):
"""Normalize inplace the rows of a CSR matrix or array by their L2 norm.
Parameters
----------
X : scipy.sparse.csr_matrix or scipy.sparse.csr_array, shape=(n_samples, n_features)
The input matrix or array to be modified inplace.
Examples
--------
>>> from scipy.sparse import csr_matrix
>>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
>>> X = csr_matrix(([1.0, 2.0, 3.0, 4.0], [0, 1, 2, 3], [0, 2, 3, 4]), shape=(3, 4))
>>> X.toarray()
array([[1., 2., 0., 0.],
[0., 0., 3., 0.],
[0., 0., 0., 4.]])
>>> inplace_csr_row_normalize_l2(X)
>>> X.toarray()
array([[0.44... , 0.89... , 0. , 0. ],
[0. , 0. , 1. , 0. ],
[0. , 0. , 0. , 1. ]])
"""
_inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr)
def _inplace_csr_row_normalize_l2(
floating[:] X_data,
shape,
const integral[:] X_indices,
const integral[:] X_indptr,
):
cdef:
uint64_t n_samples = shape[0]
uint64_t i
integral j
double sum_
for i in range(n_samples):
sum_ = 0.0
for j in range(X_indptr[i], X_indptr[i + 1]):
sum_ += (X_data[j] * X_data[j])
if sum_ == 0.0:
# do not normalize empty rows (can happen if CSR is not pruned
# correctly)
continue
sum_ = sqrt(sum_)
for j in range(X_indptr[i], X_indptr[i + 1]):
X_data[j] /= sum_
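For reference, the same row scaling is exposed through the public `sklearn.preprocessing.normalize` helper; a hedged sketch (normalize returns a copy by default, and copy=False would scale a CSR input in place):
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

X = sp.csr_matrix(np.array([[1.0, 2.0, 0.0], [0.0, 0.0, 3.0]]))
X_l2 = normalize(X, norm="l2")  # new matrix; copy=False would scale X itself
row_norms = np.sqrt(np.asarray(X_l2.multiply(X_l2).sum(axis=1))).ravel()
assert np.allclose(row_norms, 1.0)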
def assign_rows_csr(
X,
const intptr_t[:] X_rows,
const intptr_t[:] out_rows,
floating[:, ::1] out,
):
"""Densify selected rows of a CSR matrix into a preallocated array.
Like out[out_rows] = X[X_rows].toarray() but without copying.
No-copy supported for both dtype=np.float32 and dtype=np.float64.
Parameters
----------
X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
X_rows : array, dtype=np.intp, shape=n_rows
out_rows : array, dtype=np.intp, shape=n_rows
out : array, shape=(arbitrary, n_features)
"""
cdef:
# intptr_t (npy_intp, np.intp in Python) is what np.where returns,
# but int is what scipy.sparse uses.
intp_t i, ind, j, k
intptr_t rX
const floating[:] data = X.data
const int32_t[:] indices = X.indices
const int32_t[:] indptr = X.indptr
if X_rows.shape[0] != out_rows.shape[0]:
raise ValueError("cannot assign %d rows to %d"
% (X_rows.shape[0], out_rows.shape[0]))
with nogil:
for k in range(out_rows.shape[0]):
out[out_rows[k]] = 0.0
for i in range(X_rows.shape[0]):
rX = X_rows[i]
for ind in range(indptr[rX], indptr[rX + 1]):
j = indices[ind]
out[out_rows[i], j] = data[ind]
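A pure SciPy/NumPy equivalent of the helper above, for illustration only (the array shapes and row selections are made up); `assign_rows_csr` computes the same assignment while avoiding the intermediate dense copy produced by `.toarray()`:
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.arange(12, dtype=np.float64).reshape(4, 3))
out = np.zeros((2, 3), dtype=np.float64)
X_rows = np.array([0, 3], dtype=np.intp)
out_rows = np.array([1, 0], dtype=np.intp)

# Equivalent of assign_rows_csr(X, X_rows, out_rows, out):
out[out_rows] = X[X_rows].toarray()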
@@ -0,0 +1,346 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
#include "MurmurHash3.h"
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
#define FORCE_INLINE __forceinline
#include <stdlib.h>
#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)
#define BIG_CONSTANT(x) (x)
// Other compilers
#else // defined(_MSC_VER)
#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
/* gcc version >= 4.4 4.1 = RHEL 5, 4.4 = RHEL 6.
* Don't inline for RHEL 5 gcc which is 4.1 */
#define FORCE_INLINE __attribute__((always_inline))
#else
#define FORCE_INLINE
#endif
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
return (x << r) | (x >> (32 - r));
}
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
return (x << r) | (x >> (64 - r));
}
#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)
#define BIG_CONSTANT(x) (x##LLU)
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here
FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
{
return p[i];
}
FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i )
{
return p[i];
}
//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche
FORCE_INLINE uint32_t fmix ( uint32_t h )
{
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
//----------
FORCE_INLINE uint64_t fmix ( uint64_t k )
{
k ^= k >> 33;
k *= BIG_CONSTANT(0xff51afd7ed558ccd);
k ^= k >> 33;
k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
k ^= k >> 33;
return k;
}
//-----------------------------------------------------------------------------
void MurmurHash3_x86_32 ( const void * key, int len,
uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 4;
uint32_t h1 = seed;
uint32_t c1 = 0xcc9e2d51;
uint32_t c2 = 0x1b873593;
//----------
// body
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = getblock(blocks,i);
k1 *= c1;
k1 = ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
uint32_t k1 = 0;
switch(len & 3)
{
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;
case 1: k1 ^= tail[0];
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
}
//----------
// finalization
h1 ^= len;
h1 = fmix(h1);
*(uint32_t*)out = h1;
}
//-----------------------------------------------------------------------------
void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
uint32_t h1 = seed;
uint32_t h2 = seed;
uint32_t h3 = seed;
uint32_t h4 = seed;
uint32_t c1 = 0x239b961b;
uint32_t c2 = 0xab0e9789;
uint32_t c3 = 0x38b34ae5;
uint32_t c4 = 0xa1e38b93;
//----------
// body
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = getblock(blocks,i*4+0);
uint32_t k2 = getblock(blocks,i*4+1);
uint32_t k3 = getblock(blocks,i*4+2);
uint32_t k4 = getblock(blocks,i*4+3);
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
uint32_t k1 = 0;
uint32_t k2 = 0;
uint32_t k3 = 0;
uint32_t k4 = 0;
switch(len & 15)
{
case 15: k4 ^= tail[14] << 16;
case 14: k4 ^= tail[13] << 8;
case 13: k4 ^= tail[12] << 0;
k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
case 12: k3 ^= tail[11] << 24;
case 11: k3 ^= tail[10] << 16;
case 10: k3 ^= tail[ 9] << 8;
case 9: k3 ^= tail[ 8] << 0;
k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
case 8: k2 ^= tail[ 7] << 24;
case 7: k2 ^= tail[ 6] << 16;
case 6: k2 ^= tail[ 5] << 8;
case 5: k2 ^= tail[ 4] << 0;
k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
case 4: k1 ^= tail[ 3] << 24;
case 3: k1 ^= tail[ 2] << 16;
case 2: k1 ^= tail[ 1] << 8;
case 1: k1 ^= tail[ 0] << 0;
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
}
//----------
// finalization
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h3 = fmix(h3);
h4 = fmix(h4);
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
((uint32_t*)out)[0] = h1;
((uint32_t*)out)[1] = h2;
((uint32_t*)out)[2] = h3;
((uint32_t*)out)[3] = h4;
}
//-----------------------------------------------------------------------------
void MurmurHash3_x64_128 ( const void * key, const int len,
const uint32_t seed, void * out )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
//----------
// body
const uint64_t * blocks = (const uint64_t *)(data);
for(int i = 0; i < nblocks; i++)
{
uint64_t k1 = getblock(blocks,i*2+0);
uint64_t k2 = getblock(blocks,i*2+1);
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
uint64_t k1 = 0;
uint64_t k2 = 0;
switch(len & 15)
{
case 15: k2 ^= uint64_t(tail[14]) << 48;
case 14: k2 ^= uint64_t(tail[13]) << 40;
case 13: k2 ^= uint64_t(tail[12]) << 32;
case 12: k2 ^= uint64_t(tail[11]) << 24;
case 11: k2 ^= uint64_t(tail[10]) << 16;
case 10: k2 ^= uint64_t(tail[ 9]) << 8;
case 9: k2 ^= uint64_t(tail[ 8]) << 0;
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= uint64_t(tail[ 7]) << 56;
case 7: k1 ^= uint64_t(tail[ 6]) << 48;
case 6: k1 ^= uint64_t(tail[ 5]) << 40;
case 5: k1 ^= uint64_t(tail[ 4]) << 32;
case 4: k1 ^= uint64_t(tail[ 3]) << 24;
case 3: k1 ^= uint64_t(tail[ 2]) << 16;
case 2: k1 ^= uint64_t(tail[ 1]) << 8;
case 1: k1 ^= uint64_t(tail[ 0]) << 0;
k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
}
//----------
// finalization
h1 ^= len; h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h1 += h2;
h2 += h1;
((uint64_t*)out)[0] = h1;
((uint64_t*)out)[1] = h2;
}
//-----------------------------------------------------------------------------
@@ -0,0 +1,45 @@
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_
//-----------------------------------------------------------------------------
// Platform-specific functions and macros
// Microsoft Visual Studio
#if defined(_MSC_VER)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
// Other compilers
#else // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)
//-----------------------------------------------------------------------------
#ifdef __cplusplus
extern "C" {
#endif
void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
#ifdef __cplusplus
}
#endif
//-----------------------------------------------------------------------------
#endif // _MURMURHASH3_H_
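These C++ routines are exposed in Python through `sklearn.utils.murmurhash3_32` (the 32-bit x86 variant). A hedged usage sketch; the exact hash values are intentionally not reproduced here:
from sklearn.utils import murmurhash3_32

# Signed 32-bit hash of a string key with an explicit seed.
h_signed = murmurhash3_32("some-feature-name", seed=0)
# positive=True returns the hash interpreted as an unsigned 32-bit integer,
# which is convenient for hashing-trick style feature indexing.
h_unsigned = murmurhash3_32("some-feature-name", seed=0, positive=True)
assert 0 <= h_unsigned < 2 ** 32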
@@ -0,0 +1,69 @@
import numpy as np
from .extmath import stable_cumsum
def _weighted_percentile(array, sample_weight, percentile=50):
"""Compute weighted percentile
Computes the lower weighted percentile. If `array` is a 2D array, the
`percentile` is computed along axis 0.
.. versionchanged:: 0.24
Accepts 2D `array`.
Parameters
----------
array : 1D or 2D array
Values to take the weighted percentile of.
sample_weight : 1D or 2D array
Weights for each value in `array`. Must be same shape as `array` or
of shape `(array.shape[0],)`.
percentile : int or float, default=50
Percentile to compute. Must be a value between 0 and 100.
Returns
-------
percentile : int if `array` 1D, ndarray if `array` 2D
Weighted percentile.
"""
n_dim = array.ndim
if n_dim == 0:
return array[()]
if array.ndim == 1:
array = array.reshape((-1, 1))
# When sample_weight 1D, repeat for each array.shape[1]
if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]:
sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T
sorted_idx = np.argsort(array, axis=0)
sorted_weights = np.take_along_axis(sample_weight, sorted_idx, axis=0)
# Find index of median prediction for each sample
weight_cdf = stable_cumsum(sorted_weights, axis=0)
adjusted_percentile = percentile / 100 * weight_cdf[-1]
# For percentile=0, ignore leading observations with sample_weight=0. GH20528
mask = adjusted_percentile == 0
adjusted_percentile[mask] = np.nextafter(
adjusted_percentile[mask], adjusted_percentile[mask] + 1
)
percentile_idx = np.array(
[
np.searchsorted(weight_cdf[:, i], adjusted_percentile[i])
for i in range(weight_cdf.shape[1])
]
)
percentile_idx = np.array(percentile_idx)
# In rare cases, percentile_idx equals to sorted_idx.shape[0]
max_idx = sorted_idx.shape[0] - 1
percentile_idx = np.apply_along_axis(
lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx
)
col_index = np.arange(array.shape[1])
percentile_in_sorted = sorted_idx[percentile_idx, col_index]
percentile = array[percentile_in_sorted, col_index]
return percentile[0] if n_dim == 1 else percentile
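A minimal 1-D illustration of the "lower weighted percentile" idea used above (illustrative values only; this is not the function's exact code path):
import numpy as np

values = np.array([1.0, 3.0, 5.0, 7.0])
weights = np.array([1.0, 1.0, 4.0, 1.0])

order = np.argsort(values)
cdf = np.cumsum(weights[order])
# Weighted median: first sorted value whose cumulative weight reaches 50%
# of the total weight.
idx = np.searchsorted(cdf, 0.5 * cdf[-1])
assert values[order[idx]] == 5.0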
@@ -0,0 +1,16 @@
import pytest
from numpy.testing import assert_allclose
from sklearn.utils import check_random_state
from sklearn.utils._arpack import _init_arpack_v0
@pytest.mark.parametrize("seed", range(100))
def test_init_arpack_v0(seed):
# check that the initialization samples from a uniform distribution
# where we can fix the random state
size = 1000
v0 = _init_arpack_v0(size, seed)
rng = check_random_state(seed)
assert_allclose(v0, rng.uniform(-1, 1, size=size))
@@ -0,0 +1,506 @@
import re
from functools import partial
import numpy
import pytest
from numpy.testing import assert_allclose
from sklearn._config import config_context
from sklearn.base import BaseEstimator
from sklearn.utils._array_api import (
_ArrayAPIWrapper,
_asarray_with_order,
_atol_for_type,
_average,
_convert_to_numpy,
_estimator_with_converted_arrays,
_is_numpy_namespace,
_nanmax,
_nanmin,
_NumPyAPIWrapper,
_ravel,
device,
get_namespace,
indexing_dtype,
supported_float_dtypes,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
_array_api_for_tests,
skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import _IS_32BIT
@pytest.mark.parametrize("X", [numpy.asarray([1, 2, 3]), [1, 2, 3]])
def test_get_namespace_ndarray_default(X):
"""Check that get_namespace returns NumPy wrapper"""
xp_out, is_array_api_compliant = get_namespace(X)
assert isinstance(xp_out, _NumPyAPIWrapper)
assert not is_array_api_compliant
def test_get_namespace_ndarray_creation_device():
"""Check expected behavior with device and creation functions."""
X = numpy.asarray([1, 2, 3])
xp_out, _ = get_namespace(X)
full_array = xp_out.full(10, fill_value=2.0, device="cpu")
assert_allclose(full_array, [2.0] * 10)
with pytest.raises(ValueError, match="Unsupported device"):
xp_out.zeros(10, device="cuda")
@skip_if_array_api_compat_not_configured
def test_get_namespace_ndarray_with_dispatch():
"""Test get_namespace on NumPy ndarrays."""
array_api_compat = pytest.importorskip("array_api_compat")
X_np = numpy.asarray([[1, 2, 3]])
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(X_np)
assert is_array_api_compliant
assert xp_out is array_api_compat.numpy
@skip_if_array_api_compat_not_configured
def test_get_namespace_array_api():
"""Test get_namespace for ArrayAPI arrays."""
xp = pytest.importorskip("array_api_strict")
X_np = numpy.asarray([[1, 2, 3]])
X_xp = xp.asarray(X_np)
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(X_xp)
assert is_array_api_compliant
with pytest.raises(TypeError):
xp_out, is_array_api_compliant = get_namespace(X_xp, X_np)
class _AdjustableNameAPITestWrapper(_ArrayAPIWrapper):
"""API wrapper that has an adjustable name. Used for testing."""
def __init__(self, array_namespace, name):
super().__init__(array_namespace=array_namespace)
self.__name__ = name
def test_array_api_wrapper_astype():
"""Test _ArrayAPIWrapper for ArrayAPIs that is not NumPy."""
array_api_strict = pytest.importorskip("array_api_strict")
xp_ = _AdjustableNameAPITestWrapper(array_api_strict, "array_api_strict")
xp = _ArrayAPIWrapper(xp_)
X = xp.asarray(([[1, 2, 3], [3, 4, 5]]), dtype=xp.float64)
X_converted = xp.astype(X, xp.float32)
assert X_converted.dtype == xp.float32
X_converted = xp.asarray(X, dtype=xp.float32)
assert X_converted.dtype == xp.float32
@pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"])
def test_asarray_with_order(array_api):
"""Test _asarray_with_order passes along order for NumPy arrays."""
xp = pytest.importorskip(array_api)
X = xp.asarray([1.2, 3.4, 5.1])
X_new = _asarray_with_order(X, order="F", xp=xp)
X_new_np = numpy.asarray(X_new)
assert X_new_np.flags["F_CONTIGUOUS"]
def test_asarray_with_order_ignored():
"""Test _asarray_with_order ignores order for Generic ArrayAPI."""
xp = pytest.importorskip("array_api_strict")
xp_ = _AdjustableNameAPITestWrapper(xp, "array_api_strict")
X = numpy.asarray([[1.2, 3.4, 5.1], [3.4, 5.5, 1.2]], order="C")
X = xp_.asarray(X)
X_new = _asarray_with_order(X, order="F", xp=xp_)
X_new_np = numpy.asarray(X_new)
assert X_new_np.flags["C_CONTIGUOUS"]
assert not X_new_np.flags["F_CONTIGUOUS"]
@pytest.mark.parametrize(
"array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
"weights, axis, normalize, expected",
[
# normalize = True
(None, None, True, 3.5),
(None, 0, True, [2.5, 3.5, 4.5]),
(None, 1, True, [2, 5]),
([True, False], 0, True, [1, 2, 3]), # boolean weights
([True, True, False], 1, True, [1.5, 4.5]), # boolean weights
([0.4, 0.1], 0, True, [1.6, 2.6, 3.6]),
([0.4, 0.2, 0.2], 1, True, [1.75, 4.75]),
([1, 2], 0, True, [3, 4, 5]),
([1, 1, 2], 1, True, [2.25, 5.25]),
([[1, 2, 3], [1, 2, 3]], 0, True, [2.5, 3.5, 4.5]),
([[1, 2, 1], [2, 2, 2]], 1, True, [2, 5]),
# normalize = False
(None, None, False, 21),
(None, 0, False, [5, 7, 9]),
(None, 1, False, [6, 15]),
([True, False], 0, False, [1, 2, 3]), # boolean weights
([True, True, False], 1, False, [3, 9]), # boolean weights
([0.4, 0.1], 0, False, [0.8, 1.3, 1.8]),
([0.4, 0.2, 0.2], 1, False, [1.4, 3.8]),
([1, 2], 0, False, [9, 12, 15]),
([1, 1, 2], 1, False, [9, 21]),
([[1, 2, 3], [1, 2, 3]], 0, False, [5, 14, 27]),
([[1, 2, 1], [2, 2, 2]], 1, False, [8, 30]),
],
)
def test_average(
array_namespace, device, dtype_name, weights, axis, normalize, expected
):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
array_in = xp.asarray(array_in, device=device)
if weights is not None:
weights = numpy.asarray(weights, dtype=dtype_name)
weights = xp.asarray(weights, device=device)
with config_context(array_api_dispatch=True):
result = _average(array_in, axis=axis, weights=weights, normalize=normalize)
assert getattr(array_in, "device", None) == getattr(result, "device", None)
result = _convert_to_numpy(result, xp)
assert_allclose(result, expected, atol=_atol_for_type(dtype_name))
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(include_numpy_namespaces=False),
)
def test_average_raises_with_wrong_dtype(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([2, 0], dtype=dtype_name) + 1j * numpy.asarray(
[4, 3], dtype=dtype_name
)
complex_type_name = array_in.dtype.name
if not hasattr(xp, complex_type_name):
# This is the case for cupy as of March 2024 for instance.
pytest.skip(f"{array_namespace} does not support {complex_type_name}")
array_in = xp.asarray(array_in, device=device)
err_msg = "Complex floating point values are not supported by average."
with (
config_context(array_api_dispatch=True),
pytest.raises(NotImplementedError, match=err_msg),
):
_average(array_in)
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(include_numpy_namespaces=True),
)
@pytest.mark.parametrize(
"axis, weights, error, error_msg",
(
(
None,
[1, 2],
TypeError,
"Axis must be specified",
),
(
0,
[[1, 2]],
TypeError,
"1D weights expected",
),
(
0,
[1, 2, 3, 4],
ValueError,
"Length of weights",
),
(0, [-1, 1], ZeroDivisionError, "Weights sum to zero, can't be normalized"),
),
)
def test_average_raises_with_invalid_parameters(
array_namespace, device, dtype_name, axis, weights, error, error_msg
):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
array_in = xp.asarray(array_in, device=device)
weights = numpy.asarray(weights, dtype=dtype_name)
weights = xp.asarray(weights, device=device)
with config_context(array_api_dispatch=True), pytest.raises(error, match=error_msg):
_average(array_in, axis=axis, weights=weights)
def test_device_raises_if_no_input():
err_msg = re.escape(
"At least one input array expected after filtering with remove_none=True, "
"remove_types=[str]. Got none. Original types: []."
)
with pytest.raises(ValueError, match=err_msg):
device()
err_msg = re.escape(
"At least one input array expected after filtering with remove_none=True, "
"remove_types=[str]. Got none. Original types: [NoneType, str]."
)
with pytest.raises(ValueError, match=err_msg):
device(None, "name")
def test_device_inspection():
class Device:
def __init__(self, name):
self.name = name
def __eq__(self, device):
return self.name == device.name
def __hash__(self):
raise TypeError("Device object is not hashable")
def __str__(self):
return self.name
class Array:
def __init__(self, device_name):
self.device = Device(device_name)
# Sanity check: ensure our Device mock class is non-hashable, to
# accurately account for non-hashable device objects in some array
# libraries, because of which the `device` inspection function shouldn't
# make use of hash lookup tables (in particular, not use `set`)
with pytest.raises(TypeError):
hash(Array("device").device)
# Test raise if on different devices
err_msg = "Input arrays use different devices: cpu, mygpu"
with pytest.raises(ValueError, match=err_msg):
device(Array("cpu"), Array("mygpu"))
# Test expected value is returned otherwise
array1 = Array("device")
array2 = Array("device")
assert array1.device == device(array1)
assert array1.device == device(array1, array2)
assert array1.device == device(array1, array1, array2)
# TODO: add cupy and cupy.array_api to the list of libraries once the
# the following upstream issue has been fixed:
# https://github.com/cupy/cupy/issues/8180
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["numpy", "array_api_strict", "torch"])
@pytest.mark.parametrize(
"X,reduction,expected",
[
([1, 2, numpy.nan], _nanmin, 1),
([1, -2, -numpy.nan], _nanmin, -2),
([numpy.inf, numpy.inf], _nanmin, numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmin, axis=0),
[1.0, 2.0, 3.0],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmin, axis=1),
[1.0, numpy.nan, 4.0],
),
([1, 2, numpy.nan], _nanmax, 2),
([1, -2, -numpy.nan], _nanmax, 1),
([-numpy.inf, -numpy.inf], _nanmax, -numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmax, axis=0),
[4.0, 5.0, 6.0],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmax, axis=1),
[3.0, numpy.nan, 6.0],
),
],
)
def test_nan_reductions(library, X, reduction, expected):
"""Check NaN reductions like _nanmin and _nanmax"""
xp = pytest.importorskip(library)
with config_context(array_api_dispatch=True):
result = reduction(xp.asarray(X))
result = _convert_to_numpy(result, xp)
assert_allclose(result, expected)
@pytest.mark.parametrize(
"namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
)
def test_ravel(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
array_xp = xp.asarray(array, device=_device)
with config_context(array_api_dispatch=True):
result = _ravel(array_xp)
result = _convert_to_numpy(result, xp)
expected = numpy.ravel(array, order="C")
assert_allclose(expected, result)
if _is_numpy_namespace(xp):
assert numpy.asarray(result).flags["C_CONTIGUOUS"]
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["cupy", "torch", "cupy.array_api"])
def test_convert_to_numpy_gpu(library): # pragma: nocover
"""Check convert_to_numpy for GPU backed libraries."""
xp = pytest.importorskip(library)
if library == "torch":
if not xp.backends.cuda.is_built():
pytest.skip("test requires cuda")
X_gpu = xp.asarray([1.0, 2.0, 3.0], device="cuda")
else:
X_gpu = xp.asarray([1.0, 2.0, 3.0])
X_cpu = _convert_to_numpy(X_gpu, xp=xp)
expected_output = numpy.asarray([1.0, 2.0, 3.0])
assert_allclose(X_cpu, expected_output)
def test_convert_to_numpy_cpu():
"""Check convert_to_numpy for PyTorch CPU arrays."""
torch = pytest.importorskip("torch")
X_torch = torch.asarray([1.0, 2.0, 3.0], device="cpu")
X_cpu = _convert_to_numpy(X_torch, xp=torch)
expected_output = numpy.asarray([1.0, 2.0, 3.0])
assert_allclose(X_cpu, expected_output)
class SimpleEstimator(BaseEstimator):
def fit(self, X, y=None):
self.X_ = X
self.n_features_ = X.shape[0]
return self
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, converter",
[
("torch", lambda array: array.cpu().numpy()),
("array_api_strict", lambda array: numpy.asarray(array)),
("cupy.array_api", lambda array: array._array.get()),
],
)
def test_convert_estimator_to_ndarray(array_namespace, converter):
"""Convert estimator attributes to ndarray."""
xp = pytest.importorskip(array_namespace)
X = xp.asarray([[1.3, 4.5]])
est = SimpleEstimator().fit(X)
new_est = _estimator_with_converted_arrays(est, converter)
assert isinstance(new_est.X_, numpy.ndarray)
@skip_if_array_api_compat_not_configured
def test_convert_estimator_to_array_api():
"""Convert estimator attributes to ArrayAPI arrays."""
xp = pytest.importorskip("array_api_strict")
X_np = numpy.asarray([[1.3, 4.5]])
est = SimpleEstimator().fit(X_np)
new_est = _estimator_with_converted_arrays(est, lambda array: xp.asarray(array))
assert hasattr(new_est.X_, "__array_namespace__")
def test_reshape_behavior():
"""Check reshape behavior with copy and is strict with non-tuple shape."""
xp = _NumPyAPIWrapper()
X = xp.asarray([[1, 2, 3], [3, 4, 5]])
X_no_copy = xp.reshape(X, (-1,), copy=False)
assert X_no_copy.base is X
X_copy = xp.reshape(X, (6, 1), copy=True)
assert X_copy.base is not X.base
with pytest.raises(TypeError, match="shape must be a tuple"):
xp.reshape(X, -1)
@pytest.mark.parametrize("wrapper", [_ArrayAPIWrapper, _NumPyAPIWrapper])
def test_get_namespace_array_api_isdtype(wrapper):
"""Test isdtype implementation from _ArrayAPIWrapper and _NumPyAPIWrapper."""
if wrapper == _ArrayAPIWrapper:
xp_ = pytest.importorskip("array_api_strict")
xp = _ArrayAPIWrapper(xp_)
else:
xp = _NumPyAPIWrapper()
assert xp.isdtype(xp.float32, xp.float32)
assert xp.isdtype(xp.float32, "real floating")
assert xp.isdtype(xp.float64, "real floating")
assert not xp.isdtype(xp.int32, "real floating")
for dtype in supported_float_dtypes(xp):
assert xp.isdtype(dtype, "real floating")
assert xp.isdtype(xp.bool, "bool")
assert not xp.isdtype(xp.float32, "bool")
assert xp.isdtype(xp.int16, "signed integer")
assert not xp.isdtype(xp.uint32, "signed integer")
assert xp.isdtype(xp.uint16, "unsigned integer")
assert not xp.isdtype(xp.int64, "unsigned integer")
assert xp.isdtype(xp.int64, "numeric")
assert xp.isdtype(xp.float32, "numeric")
assert xp.isdtype(xp.uint32, "numeric")
assert not xp.isdtype(xp.float32, "complex floating")
if wrapper == _NumPyAPIWrapper:
assert not xp.isdtype(xp.int8, "complex floating")
assert xp.isdtype(xp.complex64, "complex floating")
assert xp.isdtype(xp.complex128, "complex floating")
with pytest.raises(ValueError, match="Unrecognized data type"):
assert xp.isdtype(xp.int16, "unknown")
@pytest.mark.parametrize(
"namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
)
def test_indexing_dtype(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
if _IS_32BIT:
assert indexing_dtype(xp) == xp.int32
else:
assert indexing_dtype(xp) == xp.int64
@@ -0,0 +1,40 @@
import numpy as np
import pytest
from sklearn.utils._testing import assert_allclose
from sklearn.utils.arrayfuncs import _all_with_any_reduction_axis_1, min_pos
def test_min_pos():
# Check that min_pos returns a positive value and that it's consistent
# between float and double
X = np.random.RandomState(0).randn(100)
min_double = min_pos(X)
min_float = min_pos(X.astype(np.float32))
assert_allclose(min_double, min_float)
assert min_double >= 0
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_min_pos_no_positive(dtype):
# Check that the return value of min_pos is the maximum representable
# value of the input dtype when all input elements are <= 0 (#19328)
X = np.full(100, -1.0).astype(dtype, copy=False)
assert min_pos(X) == np.finfo(dtype).max
@pytest.mark.parametrize(
"dtype", [np.int16, np.int32, np.int64, np.float32, np.float64]
)
@pytest.mark.parametrize("value", [0, 1.5, -1])
def test_all_with_any_reduction_axis_1(dtype, value):
# Check that return value is False when there is no row equal to `value`
X = np.arange(12, dtype=dtype).reshape(3, 4)
assert not _all_with_any_reduction_axis_1(X, value=value)
# Make a row equal to `value`
X[1, :] = value
assert _all_with_any_reduction_axis_1(X, value=value)
@@ -0,0 +1,32 @@
import warnings
import numpy as np
import pytest
from sklearn.utils import Bunch
def test_bunch_attribute_deprecation():
"""Check that bunch raises deprecation message with `__getattr__`."""
bunch = Bunch()
values = np.asarray([1, 2, 3])
msg = (
"Key: 'values', is deprecated in 1.3 and will be "
"removed in 1.5. Please use 'grid_values' instead"
)
bunch._set_deprecated(
values, new_key="grid_values", deprecated_key="values", warning_message=msg
)
with warnings.catch_warnings():
# Does not warn for "grid_values"
warnings.simplefilter("error")
v = bunch["grid_values"]
assert v is values
with pytest.warns(FutureWarning, match=msg):
# Warns for "values"
v = bunch["values"]
assert v is values
@@ -0,0 +1,73 @@
import warnings
from itertools import chain
import pytest
from sklearn import config_context
from sklearn.utils._chunking import gen_even_slices, get_chunk_n_rows
from sklearn.utils._testing import assert_array_equal
def test_gen_even_slices():
# check that gen_even_slices contains all samples
some_range = range(10)
joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)]))
assert_array_equal(some_range, joined_range)
@pytest.mark.parametrize(
("row_bytes", "max_n_rows", "working_memory", "expected"),
[
(1024, None, 1, 1024),
(1024, None, 0.99999999, 1023),
(1023, None, 1, 1025),
(1025, None, 1, 1023),
(1024, None, 2, 2048),
(1024, 7, 1, 7),
(1024 * 1024, None, 1, 1),
],
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
actual = get_chunk_n_rows(
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory,
)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
assert actual == expected
assert type(actual) is type(expected)
def test_get_chunk_n_rows_warns():
"""Check that warning is raised when working_memory is too low."""
row_bytes = 1024 * 1024 + 1
max_n_rows = None
working_memory = 1
expected = 1
warn_msg = (
"Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
)
with pytest.warns(UserWarning, match=warn_msg):
actual = get_chunk_n_rows(
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory,
)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
with pytest.warns(UserWarning, match=warn_msg):
actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
assert actual == expected
assert type(actual) is type(expected)
@@ -0,0 +1,316 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.utils.fixes import CSC_CONTAINERS
def test_compute_class_weight():
# Test (and demo) compute_class_weight.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
cw = compute_class_weight("balanced", classes=classes, y=y)
# total effect of samples is preserved
class_counts = np.bincount(y)[2:]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert cw[0] < cw[1] < cw[2]
@pytest.mark.parametrize(
"y_type, class_weight, classes, err_msg",
[
(
"numeric",
"balanced",
np.arange(4),
"classes should have valid labels that are in y",
),
# Non-regression for https://github.com/scikit-learn/scikit-learn/issues/8312
(
"numeric",
{"label_not_present": 1.0},
np.arange(4),
r"The classes, \[0, 1, 2, 3\], are not in class_weight",
),
(
"numeric",
"balanced",
np.arange(2),
"classes should include all valid labels",
),
(
"numeric",
{0: 1.0, 1: 2.0},
np.arange(2),
"classes should include all valid labels",
),
(
"string",
{"dogs": 3, "cat": 2},
np.array(["dog", "cat"]),
r"The classes, \['dog'\], are not in class_weight",
),
],
)
def test_compute_class_weight_not_present(y_type, class_weight, classes, err_msg):
# Raise error when y does not contain all class labels
y = (
np.asarray([0, 0, 0, 1, 1, 2])
if y_type == "numeric"
else np.asarray(["dog", "cat", "dog"])
)
print(y)
with pytest.raises(ValueError, match=err_msg):
compute_class_weight(class_weight, classes=classes, y=y)
def test_compute_class_weight_dict():
classes = np.arange(3)
class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
y = np.asarray([0, 0, 1, 2])
cw = compute_class_weight(class_weights, classes=classes, y=y)
# When the user specifies class weights, compute_class_weights should just
# return them.
assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)
# When a class weight is specified that isn't in classes, the weight is ignored
class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
cw = compute_class_weight(class_weights, classes=classes, y=y)
assert_allclose([1.0, 2.0, 3.0], cw)
class_weights = {-1: 5.0, 0: 4.0, 1: 2.0, 2: 3.0}
cw = compute_class_weight(class_weights, classes=classes, y=y)
assert_allclose([4.0, 2.0, 3.0], cw)
def test_compute_class_weight_invariance():
# Test that results with class_weight="balanced" is invariant wrt
# class imbalance if the number of samples is identical.
# The test uses a balanced two class dataset with 100 datapoints.
# It creates three versions, one where class 1 is duplicated
# resulting in 150 points of class 1 and 50 of class 0,
# one where there are 50 points in class 1 and 150 in class 0,
# and one where there are 100 points of each class (this one is balanced
# again).
# With balancing class weights, all three should give the same model.
X, y = make_blobs(centers=2, random_state=0)
# create dataset where class 1 is duplicated twice
X_1 = np.vstack([X] + [X[y == 1]] * 2)
y_1 = np.hstack([y] + [y[y == 1]] * 2)
# create dataset where class 0 is duplicated twice
X_0 = np.vstack([X] + [X[y == 0]] * 2)
y_0 = np.hstack([y] + [y[y == 0]] * 2)
# duplicate everything
X_ = np.vstack([X] * 2)
y_ = np.hstack([y] * 2)
# results should be identical
logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def test_compute_class_weight_balanced_negative():
# Test compute_class_weight when labels are negative
# Test with balanced class labels.
classes = np.array([-2, -1, 0])
y = np.asarray([-1, -1, 0, 0, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
assert_array_almost_equal(cw, np.array([1.0, 1.0, 1.0]))
# Test with unbalanced class labels.
y = np.asarray([-1, 0, 0, -2, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
class_counts = np.bincount(y + 2)
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2.0 / 3, 2.0, 1.0])
def test_compute_class_weight_balanced_unordered():
# Test compute_class_weight when classes are unordered
classes = np.array([1, 0, 3])
y = np.asarray([1, 0, 0, 3, 3, 3])
cw = compute_class_weight("balanced", classes=classes, y=y)
class_counts = np.bincount(y)[classes]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2.0, 1.0, 2.0 / 3])
def test_compute_class_weight_default():
# Test for the case where no weight is given for a present class.
# Current behaviour is to assign the unweighted classes a weight of 1.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
classes_len = len(classes)
# Test for non specified weights
cw = compute_class_weight(None, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, np.ones(3))
# Tests for partly specified weights
cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1.0, 1.0])
cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1.0, 0.5])
def test_compute_sample_weight():
# Test (and demo) compute_sample_weight.
# Test with balanced classes
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with user-defined weights
sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0])
# Test with column vector of balanced classes
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with unbalanced classes
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y)
expected_balanced = np.array(
[0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333]
)
assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)
# Test with `None` weights
sample_weight = compute_sample_weight(None, y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with multi-output of balanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with multi-output with user-defined weights
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0])
# Test with multi-output of unbalanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, expected_balanced**2, decimal=3)
def test_compute_sample_weight_with_subsample():
# Test compute_sample_weight with subsamples specified.
# Test with balanced classes and all samples present
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with column vector of balanced classes and all samples present
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with a subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(4))
assert_array_almost_equal(sample_weight, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0])
# Test with a bootstrap subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0])
assert_array_almost_equal(sample_weight, expected_balanced)
# Test with a bootstrap subsample for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
assert_array_almost_equal(sample_weight, expected_balanced**2)
# Test with a missing class
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
# Test with a missing class for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
@pytest.mark.parametrize(
"y_type, class_weight, indices, err_msg",
[
(
"single-output",
{1: 2, 2: 1},
range(4),
"The only valid class_weight for subsampling is 'balanced'.",
),
(
"multi-output",
{1: 2, 2: 1},
None,
"For multi-output, class_weight should be a list of dicts, or the string",
),
(
"multi-output",
[{1: 2, 2: 1}],
None,
r"Got 1 element\(s\) while having 2 outputs",
),
],
)
def test_compute_sample_weight_errors(y_type, class_weight, indices, err_msg):
# Test compute_sample_weight raises errors expected.
# Invalid preset string
y_single_output = np.asarray([1, 1, 1, 2, 2, 2])
y_multi_output = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
y = y_single_output if y_type == "single-output" else y_multi_output
with pytest.raises(ValueError, match=err_msg):
compute_sample_weight(class_weight, y, indices=indices)
def test_compute_sample_weight_more_than_32():
# Non-regression smoke test for #12146
y = np.arange(50) # more than 32 distinct classes
indices = np.arange(50) # use subsampling
weight = compute_sample_weight("balanced", y, indices=indices)
assert_array_almost_equal(weight, np.ones(y.shape[0]))
def test_class_weight_does_not_contains_more_classes():
"""Check that class_weight can contain more labels than in y.
Non-regression test for #22413
"""
tree = DecisionTreeClassifier(class_weight={0: 1, 1: 10, 2: 20})
# Does not raise
tree.fit([[0, 0, 1], [1, 0, 1], [1, 2, 0]], [0, 0, 1])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_compute_sample_weight_sparse(csc_container):
"""Check that we can compute weight for sparse `y`."""
y = csc_container(np.asarray([[0], [1], [1]]))
sample_weight = compute_sample_weight("balanced", y)
assert_allclose(sample_weight, [1.5, 0.75, 0.75])
@@ -0,0 +1,234 @@
import numpy as np
import pytest
from sklearn.utils._cython_blas import (
ColMajor,
NoTrans,
RowMajor,
Trans,
_asum_memview,
_axpy_memview,
_copy_memview,
_dot_memview,
_gemm_memview,
_gemv_memview,
_ger_memview,
_nrm2_memview,
_rot_memview,
_rotg_memview,
_scal_memview,
)
from sklearn.utils._testing import assert_allclose
def _numpy_to_cython(dtype):
cython = pytest.importorskip("cython")
if dtype == np.float32:
return cython.float
elif dtype == np.float64:
return cython.double
RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {RowMajor: "C", ColMajor: "F"}
def _no_op(x):
return x
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
dot = _dot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
expected = x.dot(y)
actual = dot(x, y)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
asum = _asum_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.abs(x).sum()
actual = asum(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
axpy = _axpy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x + y
axpy(alpha, x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.linalg.norm(x)
actual = nrm2(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
copy = _copy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = np.empty_like(x)
expected = x.copy()
copy(x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
scal = _scal_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x
scal(alpha, x)
assert_allclose(x, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
rotg = _rotg_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
a = dtype(rng.randn())
b = dtype(rng.randn())
c, s = 0.0, 0.0
def expected_rotg(a, b):
roe = a if abs(a) > abs(b) else b
if a == 0 and b == 0:
c, s, r, z = (1, 0, 0, 0)
else:
r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
c, s = a / r, b / r
z = s if roe == a else (1 if c == 0 else 1 / c)
return r, z, c, s
expected = expected_rotg(a, b)
actual = rotg(a, b, c, s)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
rot = _rot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
c = dtype(rng.randn())
s = dtype(rng.randn())
expected_x = c * x + s * y
expected_y = c * y - s * x
rot(x, y, c, s)
assert_allclose(x, expected_x)
assert_allclose(y, expected_y)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
gemv = _gemv_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(
opA(rng.random_sample((20, 10)).astype(dtype, copy=False)), order=ORDER[order]
)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(x) + beta * y
gemv(transA, alpha, A, x, beta, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
ger = _ger_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
A = np.asarray(
rng.random_sample((10, 20)).astype(dtype, copy=False), order=ORDER[order]
)
alpha = 2.5
expected = alpha * np.outer(x, y) + A
ger(alpha, x, y, A)
assert_allclose(A, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"opB, transB", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize(
"opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
gemm = _gemm_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(
opA(rng.random_sample((30, 10)).astype(dtype, copy=False)), order=ORDER[order]
)
B = np.asarray(
opB(rng.random_sample((10, 20)).astype(dtype, copy=False)), order=ORDER[order]
)
C = np.asarray(
rng.random_sample((30, 20)).astype(dtype, copy=False), order=ORDER[order]
)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(opB(B)) + beta * C
gemm(transA, transB, alpha, A, B, beta, C)
assert_allclose(C, expected, rtol=RTOL[dtype])
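# A minimal sketch of why ORDER maps RowMajor to "C" and ColMajor to "F": the
# two layouts hold the same values and differ only in which axis is contiguous
# in memory, which is what the BLAS wrappers above need to know (the helper
# name below is illustrative).
def _order_layout_sketch():
    rng = np.random.RandomState(0)
    A = rng.random_sample((3, 2))
    A_c = np.asarray(A, order=ORDER[RowMajor])  # "C": rows are contiguous
    A_f = np.asarray(A, order=ORDER[ColMajor])  # "F": columns are contiguous
    assert A_c.flags["C_CONTIGUOUS"] and A_f.flags["F_CONTIGUOUS"]
    assert np.array_equal(A_c, A_f)  # same values, different strides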
@@ -0,0 +1,22 @@
import pathlib
import pytest
import sklearn
def test_files_generated_by_templates_are_git_ignored():
"""Check the consistence of the files generated from template files."""
gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore"
if not gitignore_file.exists():
pytest.skip("Tests are not run from the source folder")
base_dir = pathlib.Path(sklearn.__file__).parent
ignored_files = gitignore_file.read_text().split("\n")
ignored_files = [pathlib.Path(line) for line in ignored_files]
for filename in base_dir.glob("**/*.tp"):
filename = filename.relative_to(base_dir.parent)
# From "path/to/template.p??.tp" to "path/to/template.p??"
filename_wo_tempita_suffix = filename.with_suffix("")
assert filename_wo_tempita_suffix in ignored_files
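# A minimal sketch of the suffix-stripping step used above: `with_suffix("")`
# drops only the trailing ".tp", leaving the name of the generated file that is
# expected to be listed in .gitignore (the path below is illustrative).
def _template_suffix_sketch():
    template = pathlib.Path("sklearn/utils/_example.pyx.tp")
    generated = template.with_suffix("")
    assert generated == pathlib.Path("sklearn/utils/_example.pyx")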
@@ -0,0 +1,88 @@
# Authors: Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause
import pickle
import pytest
from sklearn.utils.deprecation import _is_deprecated, deprecated
@deprecated("qwerty")
class MockClass1:
pass
class MockClass2:
@deprecated("mockclass2_method")
def method(self):
pass
@deprecated("n_features_ is deprecated") # type: ignore
@property
def n_features_(self):
"""Number of input features."""
return 10
class MockClass3:
@deprecated()
def __init__(self):
pass
class MockClass4:
pass
class MockClass5(MockClass1):
"""Inherit from deprecated class but does not call super().__init__."""
def __init__(self, a):
self.a = a
@deprecated("a message")
class MockClass6:
"""A deprecated class that overrides __new__."""
def __new__(cls, *args, **kwargs):
assert len(args) > 0
return super().__new__(cls)
@deprecated()
def mock_function():
return 10
def test_deprecated():
with pytest.warns(FutureWarning, match="qwerty"):
MockClass1()
with pytest.warns(FutureWarning, match="mockclass2_method"):
MockClass2().method()
with pytest.warns(FutureWarning, match="deprecated"):
MockClass3()
with pytest.warns(FutureWarning, match="qwerty"):
MockClass5(42)
with pytest.warns(FutureWarning, match="a message"):
MockClass6(42)
with pytest.warns(FutureWarning, match="deprecated"):
val = mock_function()
assert val == 10
def test_is_deprecated():
# Test that the _is_deprecated helper identifies wrapping via deprecated.
# NOTE: it works only for class methods and functions.
assert _is_deprecated(MockClass1.__new__)
assert _is_deprecated(MockClass2().method)
assert _is_deprecated(MockClass3.__init__)
assert not _is_deprecated(MockClass4.__init__)
assert _is_deprecated(MockClass5.__new__)
assert _is_deprecated(mock_function)
def test_pickle():
pickle.loads(pickle.dumps(mock_function))
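# A minimal usage sketch of the `deprecated` decorator exercised above: calling
# a wrapped function emits a FutureWarning containing the given message, while
# the return value is unchanged (the names below are illustrative).
def _deprecated_usage_sketch():
    import warnings

    @deprecated("use new_helper instead")
    def old_helper():
        return 10

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        assert old_helper() == 10
    assert any("use new_helper instead" in str(w.message) for w in caught)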
@@ -0,0 +1,274 @@
import pickle
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
@pytest.mark.parametrize(
"values, expected",
[
(np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
(
np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
np.array([1, 2, np.nan], dtype="float32"),
),
(
np.array(["b", "a", "c", "a", "c"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
),
(
np.array(["b", "a", None, "a", None], dtype=object),
np.array(["a", "b", None], dtype=object),
),
(np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
],
ids=["int64", "float32-nan", "object", "object-None", "str"],
)
def test_encode_util(values, expected):
uniques = _unique(values)
assert_array_equal(uniques, expected)
result, encoded = _unique(values, return_inverse=True)
assert_array_equal(result, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
result, counts = _unique(values, return_counts=True)
assert_array_equal(result, expected)
assert_array_equal(counts, np.array([2, 1, 2]))
result, encoded, counts = _unique(values, return_inverse=True, return_counts=True)
assert_array_equal(result, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
assert_array_equal(counts, np.array([2, 1, 2]))
def test_encode_with_check_unknown():
# test for the check_unknown parameter of _encode()
uniques = np.array([1, 2, 3])
values = np.array([1, 2, 3, 4])
# Default is True, raise error
with pytest.raises(ValueError, match="y contains previously unseen labels"):
_encode(values, uniques=uniques, check_unknown=True)
# don't raise an error if False
_encode(values, uniques=uniques, check_unknown=False)
# the parameter is ignored for object dtype: unknown values always raise
uniques = np.array(["a", "b", "c"], dtype=object)
values = np.array(["a", "b", "c", "d"], dtype=object)
with pytest.raises(ValueError, match="y contains previously unseen labels"):
_encode(values, uniques=uniques, check_unknown=False)
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
diff = _check_unknown(values, uniques)
assert_array_equal(diff, expected_diff)
diff, valid_mask = _check_unknown(values, uniques, return_mask=True)
assert_array_equal(diff, expected_diff)
assert_array_equal(valid_mask, expected_mask)
@pytest.mark.parametrize(
"values, uniques, expected_diff, expected_mask",
[
(np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
(np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
(np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
(
np.array([2, 1, 4, np.nan]),
np.array([2, 5, 1, np.nan]),
[4],
[True, True, False, True],
),
(
np.array([2, 1, 4, np.nan]),
np.array([2, 5, 1]),
[4, np.nan],
[True, True, False, False],
),
(
np.array([2, 1, 4, 5]),
np.array([2, 5, 1, np.nan]),
[4],
[True, True, False, True],
),
(
np.array(["a", "b", "c", "d"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
np.array(["d"], dtype=object),
[True, True, True, False],
),
(
np.array(["d", "c", "a", "b"], dtype=object),
np.array(["a", "c", "b"], dtype=object),
np.array(["d"], dtype=object),
[False, True, True, True],
),
(
np.array(["a", "b", "c", "d"]),
np.array(["a", "b", "c"]),
np.array(["d"]),
[True, True, True, False],
),
(
np.array(["d", "c", "a", "b"]),
np.array(["a", "c", "b"]),
np.array(["d"]),
[False, True, True, True],
),
],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
# check for check_unknown with missing values with object dtypes
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
uniques = np.array(["c", "a", "b", missing_value], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = ["d"]
expected_mask = [False, True, True, True, True]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
uniques = np.array(["c", "a", "b"], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = ["d", missing_value]
expected_mask = [False, True, True, True, False]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
values = np.array(["a", missing_value], dtype=object)
uniques = np.array(["a", "b", "z"], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = [missing_value]
expected_mask = [True, False]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
# check for _unique and _encode with missing values with object dtypes
values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)
uniques = _unique(values)
if missing_value is None:
assert_array_equal(uniques, expected_uniques)
else: # missing_value == np.nan
assert_array_equal(uniques[:-1], expected_uniques[:-1])
assert np.isnan(uniques[-1])
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))
def test_unique_util_missing_values_numeric():
# Check missing values in numerical values
values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
expected_inverse = np.array([1, 0, 3, 2, 1, 3])
uniques = _unique(values)
assert_array_equal(uniques, expected_uniques)
uniques, inverse = _unique(values, return_inverse=True)
assert_array_equal(uniques, expected_uniques)
assert_array_equal(inverse, expected_inverse)
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, expected_inverse)
def test_unique_util_with_all_missing_values():
# test for all types of missing values for object dtype
values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)
uniques = _unique(values)
assert_array_equal(uniques[:-1], ["a", "c", None])
# last value is nan
assert np.isnan(uniques[-1])
expected_inverse = [3, 0, 1, 1, 2, 3, 2]
_, inverse = _unique(values, return_inverse=True)
assert_array_equal(inverse, expected_inverse)
def test_check_unknown_with_both_missing_values():
# test for both types of missing values for object dtype
values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
diff = _check_unknown(values, known_values=np.array(["a", "c"], dtype=object))
assert diff[0] is None
assert np.isnan(diff[1])
diff, valid_mask = _check_unknown(
values, known_values=np.array(["a", "c"], dtype=object), return_mask=True
)
assert diff[0] is None
assert np.isnan(diff[1])
assert_array_equal(valid_mask, [False, True, True, True, False, False, False])
@pytest.mark.parametrize(
"values, uniques, expected_counts",
[
(np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
(
np.array([1] * 10 + [2] * 4 + [3] * 15),
np.array([1, 2, 3, 5]),
[10, 4, 15, 0],
),
(
np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
np.array([2, 3, np.nan]),
[4, 15, 10],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["a", "b", "c"],
[16, 4, 20],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["c", "b", "a"],
[20, 4, 16],
),
(
np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["c", np.nan, "a"],
[20, 4, 16],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["a", "b", "c", "e"],
[16, 4, 20, 0],
),
],
)
def test_get_counts(values, uniques, expected_counts):
counts = _get_counts(values, uniques)
assert_array_equal(counts, expected_counts)
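# A minimal round-trip sketch of the private helpers exercised above, under the
# behaviour the tests assert: `_unique` returns sorted uniques, `_encode` maps
# each value to its position in `uniques`, and `_get_counts` counts occurrences
# per unique value.
def _encode_round_trip_sketch():
    values = np.array(["b", "a", "c", "a", "c"], dtype=object)
    uniques = _unique(values)                 # ["a", "b", "c"]
    codes = _encode(values, uniques=uniques)  # [1, 0, 2, 0, 2]
    counts = _get_counts(values, uniques)     # [2, 1, 2]
    assert_array_equal(uniques[codes], values)  # decoding recovers the input
    assert counts.sum() == len(values)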
@@ -0,0 +1,518 @@
import html
import locale
import re
from contextlib import closing
from io import StringIO
from unittest.mock import patch
import pytest
from sklearn import config_context
from sklearn.base import BaseEstimator
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import StackingClassifier, StackingRegressor, VotingClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.impute import SimpleImputer
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._estimator_html_repr import (
_get_css_style,
_get_visual_block,
_HTMLDocumentationLinkMixin,
_write_label_html,
estimator_html_repr,
)
from sklearn.utils.fixes import parse_version
@pytest.mark.parametrize("checked", [True, False])
def test_write_label_html(checked):
# Test checking logic and labeling
name = "LogisticRegression"
tool_tip = "hello-world"
with closing(StringIO()) as out:
_write_label_html(out, name, tool_tip, checked=checked)
html_label = out.getvalue()
p = (
r'<label for="sk-estimator-id-[0-9]*"'
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
r"LogisticRegression"
)
re_compiled = re.compile(p)
assert re_compiled.search(html_label)
assert html_label.startswith('<div class="sk-label-container">')
assert "<pre>hello-world</pre>" in html_label
if checked:
assert "checked>" in html_label
@pytest.mark.parametrize("est", ["passthrough", "drop", None])
def test_get_visual_block_single_str_none(est):
# Test estimators that are represented by strings
est_html_info = _get_visual_block(est)
assert est_html_info.kind == "single"
assert est_html_info.estimators == est
assert est_html_info.names == str(est)
assert est_html_info.name_details == str(est)
def test_get_visual_block_single_estimator():
est = LogisticRegression(C=10.0)
est_html_info = _get_visual_block(est)
assert est_html_info.kind == "single"
assert est_html_info.estimators == est
assert est_html_info.names == est.__class__.__name__
assert est_html_info.name_details == str(est)
def test_get_visual_block_pipeline():
pipe = Pipeline(
[
("imputer", SimpleImputer()),
("do_nothing", "passthrough"),
("do_nothing_more", None),
("classifier", LogisticRegression()),
]
)
est_html_info = _get_visual_block(pipe)
assert est_html_info.kind == "serial"
assert est_html_info.estimators == tuple(step[1] for step in pipe.steps)
assert est_html_info.names == [
"imputer: SimpleImputer",
"do_nothing: passthrough",
"do_nothing_more: passthrough",
"classifier: LogisticRegression",
]
assert est_html_info.name_details == [str(est) for _, est in pipe.steps]
def test_get_visual_block_feature_union():
f_union = FeatureUnion([("pca", PCA()), ("svd", TruncatedSVD())])
est_html_info = _get_visual_block(f_union)
assert est_html_info.kind == "parallel"
assert est_html_info.names == ("pca", "svd")
assert est_html_info.estimators == tuple(
trans[1] for trans in f_union.transformer_list
)
assert est_html_info.name_details == (None, None)
def test_get_visual_block_voting():
clf = VotingClassifier(
[("log_reg", LogisticRegression()), ("mlp", MLPClassifier())]
)
est_html_info = _get_visual_block(clf)
assert est_html_info.kind == "parallel"
assert est_html_info.estimators == tuple(trans[1] for trans in clf.estimators)
assert est_html_info.names == ("log_reg", "mlp")
assert est_html_info.name_details == (None, None)
def test_get_visual_block_column_transformer():
ct = ColumnTransformer(
[("pca", PCA(), ["num1", "num2"]), ("svd", TruncatedSVD, [0, 3])]
)
est_html_info = _get_visual_block(ct)
assert est_html_info.kind == "parallel"
assert est_html_info.estimators == tuple(trans[1] for trans in ct.transformers)
assert est_html_info.names == ("pca", "svd")
assert est_html_info.name_details == (["num1", "num2"], [0, 3])
def test_estimator_html_repr_pipeline():
num_trans = Pipeline(
steps=[("pass", "passthrough"), ("imputer", SimpleImputer(strategy="median"))]
)
cat_trans = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="constant", missing_values="empty")),
("one-hot", OneHotEncoder(drop="first")),
]
)
preprocess = ColumnTransformer(
[
("num", num_trans, ["a", "b", "c", "d", "e"]),
("cat", cat_trans, [0, 1, 2, 3]),
]
)
feat_u = FeatureUnion(
[
("pca", PCA(n_components=1)),
(
"tsvd",
Pipeline(
[
("first", TruncatedSVD(n_components=3)),
("select", SelectPercentile()),
]
),
),
]
)
clf = VotingClassifier(
[
("lr", LogisticRegression(solver="lbfgs", random_state=1)),
("mlp", MLPClassifier(alpha=0.001)),
]
)
pipe = Pipeline(
[("preprocessor", preprocess), ("feat_u", feat_u), ("classifier", clf)]
)
html_output = estimator_html_repr(pipe)
# the top-level estimator shows its repr with the changed parameters
assert html.escape(str(pipe)) in html_output
for _, est in pipe.steps:
assert (
'<div class="sk-toggleable__content "><pre>' + html.escape(str(est))
) in html_output
# low level estimators do not show changes
with config_context(print_changed_only=True):
assert html.escape(str(num_trans["pass"])) in html_output
assert "passthrough</label>" in html_output
assert html.escape(str(num_trans["imputer"])) in html_output
for _, _, cols in preprocess.transformers:
assert f"<pre>{html.escape(str(cols))}</pre>" in html_output
# feature union
for name, _ in feat_u.transformer_list:
assert f"<label>{html.escape(name)}</label>" in html_output
pca = feat_u.transformer_list[0][1]
assert f"<pre>{html.escape(str(pca))}</pre>" in html_output
tsvd = feat_u.transformer_list[1][1]
first = tsvd["first"]
select = tsvd["select"]
assert f"<pre>{html.escape(str(first))}</pre>" in html_output
assert f"<pre>{html.escape(str(select))}</pre>" in html_output
# voting classifier
for name, est in clf.estimators:
assert f"<label>{html.escape(name)}</label>" in html_output
assert f"<pre>{html.escape(str(est))}</pre>" in html_output
# verify that prefers-color-scheme is implemented
assert "prefers-color-scheme" in html_output
@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
def test_stacking_classifier(final_estimator):
estimators = [
("mlp", MLPClassifier(alpha=0.001)),
("tree", DecisionTreeClassifier()),
]
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
html_output = estimator_html_repr(clf)
assert html.escape(str(clf)) in html_output
# If final_estimator's default changes from LogisticRegression
# this should be updated
if final_estimator is None:
assert "LogisticRegression(" in html_output
else:
assert final_estimator.__class__.__name__ in html_output
@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
def test_stacking_regressor(final_estimator):
reg = StackingRegressor(
estimators=[("svr", LinearSVR())], final_estimator=final_estimator
)
html_output = estimator_html_repr(reg)
assert html.escape(str(reg.estimators[0][0])) in html_output
p = (
r'<label for="sk-estimator-id-[0-9]*"'
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
r"&nbsp;LinearSVR"
)
re_compiled = re.compile(p)
assert re_compiled.search(html_output)
if final_estimator is None:
p = (
r'<label for="sk-estimator-id-[0-9]*"'
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
r"&nbsp;RidgeCV"
)
re_compiled = re.compile(p)
assert re_compiled.search(html_output)
else:
assert html.escape(final_estimator.__class__.__name__) in html_output
def test_birch_duck_typing_meta():
# Test duck typing meta estimators with Birch
birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
html_output = estimator_html_repr(birch)
# inner estimators do not show changes
with config_context(print_changed_only=True):
assert f"<pre>{html.escape(str(birch.n_clusters))}" in html_output
assert "AgglomerativeClustering</label>" in html_output
# outer estimator contains all changes
assert f"<pre>{html.escape(str(birch))}" in html_output
def test_ovo_classifier_duck_typing_meta():
# Test duck typing metaestimators with OVO
ovo = OneVsOneClassifier(LinearSVC(penalty="l1"))
html_output = estimator_html_repr(ovo)
# inner estimators do not show changes
with config_context(print_changed_only=True):
assert f"<pre>{html.escape(str(ovo.estimator))}" in html_output
# regex to match the start of the tag
p = (
r'<label for="sk-estimator-id-[0-9]*" '
r'class="sk-toggleable__label sk-toggleable__label-arrow ">&nbsp;LinearSVC'
)
re_compiled = re.compile(p)
assert re_compiled.search(html_output)
# outer estimator
assert f"<pre>{html.escape(str(ovo))}" in html_output
def test_duck_typing_nested_estimator():
# Test duck typing metaestimators with random search
kernel_ridge = KernelRidge(kernel=ExpSineSquared())
param_distributions = {"alpha": [1, 2]}
kernel_ridge_tuned = RandomizedSearchCV(
kernel_ridge,
param_distributions=param_distributions,
)
html_output = estimator_html_repr(kernel_ridge_tuned)
assert "estimator: KernelRidge</label>" in html_output
@pytest.mark.parametrize("print_changed_only", [True, False])
def test_one_estimator_print_change_only(print_changed_only):
pca = PCA(n_components=10)
with config_context(print_changed_only=print_changed_only):
pca_repr = html.escape(str(pca))
html_output = estimator_html_repr(pca)
assert pca_repr in html_output
def test_fallback_exists():
"""Check that repr fallback is in the HTML."""
pca = PCA(n_components=10)
html_output = estimator_html_repr(pca)
assert (
f'<div class="sk-text-repr-fallback"><pre>{html.escape(str(pca))}'
in html_output
)
def test_show_arrow_pipeline():
"""Show arrow in pipeline for top level in pipeline"""
pipe = Pipeline([("scale", StandardScaler()), ("log_Reg", LogisticRegression())])
html_output = estimator_html_repr(pipe)
assert (
'class="sk-toggleable__label sk-toggleable__label-arrow ">&nbsp;&nbsp;Pipeline'
in html_output
)
def test_invalid_parameters_in_stacking():
"""Invalidate stacking configuration uses default repr.
Non-regression test for #24009.
"""
stacker = StackingClassifier(estimators=[])
html_output = estimator_html_repr(stacker)
assert html.escape(str(stacker)) in html_output
def test_estimator_get_params_return_cls():
"""Check HTML repr works where a value in get_params is a class."""
class MyEstimator:
def get_params(self, deep=False):
return {"inner_cls": LogisticRegression}
est = MyEstimator()
assert "MyEstimator" in estimator_html_repr(est)
def test_estimator_html_repr_unfitted_vs_fitted():
"""Check that we have the information that the estimator is fitted or not in the
HTML representation.
"""
class MyEstimator(BaseEstimator):
def fit(self, X, y):
self.fitted_ = True
return self
X, y = load_iris(return_X_y=True)
estimator = MyEstimator()
assert "<span>Not fitted</span>" in estimator_html_repr(estimator)
estimator.fit(X, y)
assert "<span>Fitted</span>" in estimator_html_repr(estimator)
@pytest.mark.parametrize(
"estimator",
[
LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), slice(0, 3))),
LogisticRegression(),
),
],
)
def test_estimator_html_repr_fitted_icon(estimator):
"""Check that we are showing the fitted status icon only once."""
pattern = '<span class="sk-estimator-doc-link ">i<span>Not fitted</span></span>'
assert estimator_html_repr(estimator).count(pattern) == 1
X, y = load_iris(return_X_y=True)
estimator.fit(X, y)
pattern = '<span class="sk-estimator-doc-link fitted">i<span>Fitted</span></span>'
assert estimator_html_repr(estimator).count(pattern) == 1
@pytest.mark.parametrize("mock_version", ["1.3.0.dev0", "1.3.0"])
def test_html_documentation_link_mixin_sklearn(mock_version):
"""Check the behaviour of the `_HTMLDocumentationLinkMixin` class for scikit-learn
default.
"""
# mock the `__version__` where the mixin is located
with patch("sklearn.utils._estimator_html_repr.__version__", mock_version):
mixin = _HTMLDocumentationLinkMixin()
assert mixin._doc_link_module == "sklearn"
sklearn_version = parse_version(mock_version)
# we need to parse the version manually to make sure that this test also passes
# on release branches, not only on `main` (whose version is a "dev" version).
if sklearn_version.dev is None:
version = f"{sklearn_version.major}.{sklearn_version.minor}"
else:
version = "dev"
assert (
mixin._doc_link_template
== f"https://scikit-learn.org/{version}/modules/generated/"
"{estimator_module}.{estimator_name}.html"
)
assert (
mixin._get_doc_link()
== f"https://scikit-learn.org/{version}/modules/generated/"
"sklearn.utils._HTMLDocumentationLinkMixin.html"
)
@pytest.mark.parametrize(
"module_path,expected_module",
[
("prefix.mymodule", "prefix.mymodule"),
("prefix._mymodule", "prefix"),
("prefix.mypackage._mymodule", "prefix.mypackage"),
("prefix.mypackage._mymodule.submodule", "prefix.mypackage"),
("prefix.mypackage.mymodule.submodule", "prefix.mypackage.mymodule.submodule"),
],
)
def test_html_documentation_link_mixin_get_doc_link(module_path, expected_module):
"""Check the behaviour of the `_get_doc_link` with various parameter."""
class FooBar(_HTMLDocumentationLinkMixin):
pass
FooBar.__module__ = module_path
est = FooBar()
# once `_doc_link_module` matches, the estimator's module and name are inferred to fill the template
est._doc_link_module = "prefix"
est._doc_link_template = (
"https://website.com/{estimator_module}.{estimator_name}.html"
)
assert est._get_doc_link() == f"https://website.com/{expected_module}.FooBar.html"
def test_html_documentation_link_mixin_get_doc_link_out_of_library():
"""Check the behaviour of the `_get_doc_link` with various parameter."""
mixin = _HTMLDocumentationLinkMixin()
# if the `_doc_link_module` does not refer to the root module of the estimator
# (here the mixin), then we should return an empty string.
mixin._doc_link_module = "xxx"
assert mixin._get_doc_link() == ""
def test_html_documentation_link_mixin_doc_link_url_param_generator():
mixin = _HTMLDocumentationLinkMixin()
# we can bypass the generation by providing our own callable
mixin._doc_link_template = (
"https://website.com/{my_own_variable}.{another_variable}.html"
)
def url_param_generator(estimator):
return {
"my_own_variable": "value_1",
"another_variable": "value_2",
}
mixin._doc_link_url_param_generator = url_param_generator
assert mixin._get_doc_link() == "https://website.com/value_1.value_2.html"
@pytest.fixture
def set_non_utf8_locale():
"""Pytest fixture to set non utf-8 locale during the test.
The locale is set to the original one after the test has run.
"""
try:
locale.setlocale(locale.LC_CTYPE, "C")
except locale.Error:
pytest.skip("'C' locale is not available on this OS")
yield
# Resets the locale to the original one. Python calls setlocale(LC_CTYPE, "")
# at startup according to
# https://docs.python.org/3/library/locale.html#background-details-hints-tips-and-caveats.
# This assumes that no other locale changes have been made. For some reason,
# on some platforms, trying to restore locale with something like
# locale.setlocale(locale.LC_CTYPE, locale.getlocale()) raises a
# locale.Error: unsupported locale setting
locale.setlocale(locale.LC_CTYPE, "")
def test_non_utf8_locale(set_non_utf8_locale):
"""Checks that utf8 encoding is used when reading the CSS file.
Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/27725
"""
_get_css_style()
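# A minimal usage sketch of `estimator_html_repr`, matching what the tests above
# rely on: it returns an HTML fragment (a string) that embeds the estimator's
# text repr as a fallback, and the string can simply be written to a file for
# viewing in a browser (the target path below is illustrative).
def _estimator_html_repr_usage_sketch(target=None):
    est = make_pipeline(StandardScaler(), LogisticRegression())
    html_output = estimator_html_repr(est)
    assert html.escape(str(est)) in html_output
    if target is not None:  # e.g. a pathlib.Path such as tmp_path / "est.html"
        target.write_text(html_output, encoding="utf-8")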
File diff suppressed because it is too large
@@ -0,0 +1,47 @@
"""Test fast_dict."""
import numpy as np
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.utils._fast_dict import IntFloatDict, argmin
def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
values = rng.rand(len(keys))
d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value
assert len(d) == len(keys)
d.append(120, 3.0)
assert d[120] == 3.0
assert len(d) == len(keys) + 1
for i in range(2000):
d.append(i + 1000, 4.0)
assert d[1100] == 4.0
def test_int_float_dict_argmin():
# Test the argmin implementation on the IntFloatDict
keys = np.arange(100, dtype=np.intp)
values = np.arange(100, dtype=np.float64)
d = IntFloatDict(keys, values)
assert argmin(d) == (0, 0)
def test_to_arrays():
# Test that an IntFloatDict is converted into arrays
# of keys and values correctly
keys_in = np.array([1, 2, 3], dtype=np.intp)
values_in = np.array([4, 5, 6], dtype=np.float64)
d = IntFloatDict(keys_in, values_in)
keys_out, values_out = d.to_arrays()
assert keys_out.dtype == keys_in.dtype
assert values_in.dtype == values_out.dtype
assert_array_equal(keys_out, keys_in)
assert_allclose(values_out, values_in)
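# A minimal usage sketch of `IntFloatDict`, consistent with the tests above: it
# is built from parallel intp/float64 arrays, supports item access, append and
# len, converts back to arrays, and `argmin` returns the (key, value) pair with
# the smallest value.
def _int_float_dict_usage_sketch():
    keys = np.array([10, 20, 30], dtype=np.intp)
    values = np.array([0.5, 0.25, 0.75], dtype=np.float64)
    d = IntFloatDict(keys, values)
    d.append(40, 0.125)
    assert len(d) == 4 and d[40] == 0.125
    assert argmin(d) == (40, 0.125)
    keys_out, values_out = d.to_arrays()
    assert len(keys_out) == len(values_out) == 4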
@@ -0,0 +1,162 @@
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Justin Vincent
# Lars Buitinck
# License: BSD 3 clause
import numpy as np
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import _object_dtype_isnan, _smallest_admissible_index_dtype
@pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
def test_object_dtype_isnan(dtype, val):
X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)
expected_mask = np.array([[False, True], [True, False]])
mask = _object_dtype_isnan(X)
assert_array_equal(mask, expected_mask)
@pytest.mark.parametrize(
"params, expected_dtype",
[
({}, np.int32), # default behaviour
({"maxval": np.iinfo(np.int32).max}, np.int32),
({"maxval": np.iinfo(np.int32).max + 1}, np.int64),
],
)
def test_smallest_admissible_index_dtype_max_val(params, expected_dtype):
"""Check the behaviour of `smallest_admissible_index_dtype` depending only on the
`max_val` parameter.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, expected_dtype",
[
# Arrays dtype is int64 and thus should not be downcasted to int32 without
# checking the content or providing maxval.
({"arrays": np.array([1, 2], dtype=np.int64)}, np.int64),
# One of the array is int64 and should not be downcasted to int32
# for the same reasons.
(
{
"arrays": (
np.array([1, 2], dtype=np.int32),
np.array([1, 2], dtype=np.int64),
)
},
np.int64,
),
# Both arrays are already int32: we can just keep this dtype.
(
{
"arrays": (
np.array([1, 2], dtype=np.int32),
np.array([1, 2], dtype=np.int32),
)
},
np.int32,
),
# Arrays should be upcasted to at least int32 precision.
({"arrays": np.array([1, 2], dtype=np.int8)}, np.int32),
# Check that `maxval` takes precedence over the arrays and thus upcast to
# int64.
(
{
"arrays": np.array([1, 2], dtype=np.int32),
"maxval": np.iinfo(np.int32).max + 1,
},
np.int64,
),
],
)
def test_smallest_admissible_index_dtype_without_checking_contents(
params, expected_dtype
):
"""Check the behaviour of `smallest_admissible_index_dtype` using the passed
arrays but without checking the contents of the arrays.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, expected_dtype",
[
# empty arrays should always be converted to int32 indices
(
{
"arrays": (np.array([], dtype=np.int64), np.array([], dtype=np.int64)),
"check_contents": True,
},
np.int32,
),
# arrays respecting np.iinfo(np.int32).min < x < np.iinfo(np.int32).max should
# be converted to int32,
(
{"arrays": np.array([1], dtype=np.int64), "check_contents": True},
np.int32,
),
# otherwise, it should be converted to int64. We need to create a uint32
# array to accommodate a value > np.iinfo(np.int32).max
(
{
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
"check_contents": True,
},
np.int64,
),
# maxval should take precedence over the arrays contents and thus upcast to
# int64.
(
{
"arrays": np.array([1], dtype=np.int32),
"check_contents": True,
"maxval": np.iinfo(np.int32).max + 1,
},
np.int64,
),
# when maxval is small, but check_contents is True and the contents
# require np.int64, we still require np.int64 indexing in the end.
(
{
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
"check_contents": True,
"maxval": 1,
},
np.int64,
),
],
)
def test_smallest_admissible_index_dtype_by_checking_contents(params, expected_dtype):
"""Check the behaviour of `smallest_admissible_index_dtype` using the dtype of the
arrays but as well the contents.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, err_type, err_msg",
[
(
{"maxval": np.iinfo(np.int64).max + 1},
ValueError,
"is to large to be represented as np.int64",
),
(
{"arrays": np.array([1, 2], dtype=np.float64)},
ValueError,
"Array dtype float64 is not supported",
),
({"arrays": [1, 2]}, TypeError, "Arrays should be of type np.ndarray"),
],
)
def test_smallest_admissible_index_dtype_error(params, err_type, err_msg):
"""Check that we raise the proper error message."""
with pytest.raises(err_type, match=err_msg):
_smallest_admissible_index_dtype(**params)
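# A minimal sketch of how `_smallest_admissible_index_dtype` is typically used,
# consistent with the cases above: given the index arrays of a sparse matrix it
# returns np.int32 when both the contents and `maxval` fit, and np.int64
# otherwise.
def _index_dtype_usage_sketch():
    from scipy.sparse import csr_matrix

    X = csr_matrix(np.eye(3))
    dtype = _smallest_admissible_index_dtype(
        arrays=(X.indices, X.indptr), maxval=max(X.shape), check_contents=True
    )
    assert dtype == np.int32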
@@ -0,0 +1,80 @@
import numpy as np
import pytest
from scipy.sparse.csgraph import connected_components
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import _fix_connected_components
def test_fix_connected_components():
# Test that _fix_connected_components reduces the number of components to 1.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
assert n_connected_components > 1
graph = _fix_connected_components(X, graph, n_connected_components, labels)
n_connected_components, labels = connected_components(graph)
assert n_connected_components == 1
def test_fix_connected_components_precomputed():
# Test that _fix_connected_components accepts precomputed distance matrix.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
assert n_connected_components > 1
distances = pairwise_distances(X)
graph = _fix_connected_components(
distances, graph, n_connected_components, labels, metric="precomputed"
)
n_connected_components, labels = connected_components(graph)
assert n_connected_components == 1
# but it does not work with precomputed neighbors graph
with pytest.raises(RuntimeError, match="does not work with a sparse"):
_fix_connected_components(
graph, graph, n_connected_components, labels, metric="precomputed"
)
def test_fix_connected_components_wrong_mode():
# Test that an error is raised if the mode string is incorrect.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
with pytest.raises(ValueError, match="Unknown mode"):
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="foo"
)
def test_fix_connected_components_connectivity_mode():
# Test that the connectivity mode fills new connections with ones.
X = np.array([0, 1, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
n_connected_components, labels = connected_components(graph)
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="connectivity"
)
assert np.all(graph.data == 1)
def test_fix_connected_components_distance_mode():
# Test that the distance mode does not fill new connections with ones.
X = np.array([0, 1, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
assert np.all(graph.data == 1)
n_connected_components, labels = connected_components(graph)
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="distance"
)
assert not np.all(graph.data == 1)
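# A minimal end-to-end sketch of the repair performed by _fix_connected_components,
# consistent with the tests above: a 1-nearest-neighbor graph over two well
# separated groups starts with two connected components, and the helper adds the
# missing edges so that a single component remains.
def _fix_connected_components_sketch():
    X = np.array([0, 1, 10, 11])[:, None]
    graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
    n_components, labels = connected_components(graph)
    assert n_components == 2
    graph = _fix_connected_components(X, graph, n_components, labels, mode="distance")
    assert connected_components(graph)[0] == 1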
@@ -0,0 +1,594 @@
import warnings
from copy import copy
from unittest import SkipTest
import numpy as np
import pytest
import sklearn
from sklearn.externals._packaging.version import parse as parse_version
from sklearn.utils import _safe_indexing, resample, shuffle
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._indexing import (
_determine_key_type,
_get_column_indices,
_safe_assign,
)
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils._testing import (
_array_api_for_tests,
_convert_container,
assert_allclose_dense_sparse,
assert_array_equal,
skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# toy array
X_toy = np.arange(9).reshape((3, 3))
def test_polars_indexing():
"""Check _safe_indexing for polars as expected."""
pl = pytest.importorskip("polars", minversion="0.18.2")
df = pl.DataFrame(
{"a": [1, 2, 3, 4], "b": [4, 5, 6, 8], "c": [1, 4, 1, 10]}, orient="row"
)
from polars.testing import assert_frame_equal
str_keys = [["b"], ["a", "b"], ["b", "a", "c"], ["c"], ["a"]]
for key in str_keys:
out = _safe_indexing(df, key, axis=1)
assert_frame_equal(df[key], out)
bool_keys = [([True, False, True], ["a", "c"]), ([False, False, True], ["c"])]
for bool_key, str_key in bool_keys:
out = _safe_indexing(df, bool_key, axis=1)
assert_frame_equal(df[:, str_key], out)
int_keys = [([0, 1], ["a", "b"]), ([2], ["c"])]
for int_key, str_key in int_keys:
out = _safe_indexing(df, int_key, axis=1)
assert_frame_equal(df[:, str_key], out)
axis_0_keys = [[0, 1], [1, 3], [3, 2]]
for key in axis_0_keys:
out = _safe_indexing(df, key, axis=0)
assert_frame_equal(df[key], out)
@pytest.mark.parametrize(
"key, dtype",
[
(0, "int"),
("0", "str"),
(True, "bool"),
(np.bool_(True), "bool"),
([0, 1, 2], "int"),
(["0", "1", "2"], "str"),
((0, 1, 2), "int"),
(("0", "1", "2"), "str"),
(slice(None, None), None),
(slice(0, 2), "int"),
(np.array([0, 1, 2], dtype=np.int32), "int"),
(np.array([0, 1, 2], dtype=np.int64), "int"),
(np.array([0, 1, 2], dtype=np.uint8), "int"),
([True, False], "bool"),
((True, False), "bool"),
(np.array([True, False]), "bool"),
("col_0", "str"),
(["col_0", "col_1", "col_2"], "str"),
(("col_0", "col_1", "col_2"), "str"),
(slice("begin", "end"), "str"),
(np.array(["col_0", "col_1", "col_2"]), "str"),
(np.array(["col_0", "col_1", "col_2"], dtype=object), "str"),
],
)
def test_determine_key_type(key, dtype):
assert _determine_key_type(key) == dtype
def test_determine_key_type_error():
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(1.0)
def test_determine_key_type_slice_error():
with pytest.raises(TypeError, match="Only array-like or scalar are"):
_determine_key_type(slice(0, 2, 1), accept_slice=False)
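# A minimal sketch of the key-type taxonomy asserted by the parametrized cases
# above: positional keys report "int", column-name keys report "str", boolean
# masks report "bool", and the unrestricted slice reports None.
def _determine_key_type_sketch():
    assert _determine_key_type([0, 2]) == "int"
    assert _determine_key_type(["col_0", "col_2"]) == "str"
    assert _determine_key_type(np.array([True, False, True])) == "bool"
    assert _determine_key_type(slice(None, None)) is None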
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
def test_determine_key_type_array_api(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
with sklearn.config_context(array_api_dispatch=True):
int_array_key = xp.asarray([1, 2, 3])
assert _determine_key_type(int_array_key) == "int"
bool_array_key = xp.asarray([True, False, True])
assert _determine_key_type(bool_array_key) == "bool"
try:
complex_array_key = xp.asarray([1 + 1j, 2 + 2j, 3 + 3j])
except TypeError:
# Complex numbers are not supported by all Array API libraries.
complex_array_key = None
if complex_array_key is not None:
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(complex_array_key)
@pytest.mark.parametrize(
"array_type", ["list", "array", "sparse", "dataframe", "polars"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
indices = [1, 2]
if indices_type == "slice" and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(
subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
)
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_1d_container(array_type, indices_type):
indices = [1, 2]
if indices_type == "slice" and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
# validation of the indices
# we make a copy because indices is mutable and shared between tests
indices_converted = copy(indices)
if indices_type == "slice" and isinstance(indices[1], int):
indices_converted[1] += 1
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices_converted = _convert_container(indices_converted, indices_type)
if isinstance(indices[0], str) and array_type not in ("dataframe", "polars"):
err_msg = (
"Specifying the columns using strings is only supported for dataframes"
)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices_converted, axis=1)
else:
subset = _safe_indexing(array, indices_converted, axis=1)
assert_allclose_dense_sparse(
subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
)
@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
"axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_read_only_axis_1(
array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
if array_read_only:
array.setflags(write=False)
array = _convert_container(array, array_type)
indices = np.array([1, 2])
if indices_read_only:
indices.setflags(write=False)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
indices = [False] + [True] * 2 + [False] * 6
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
"axis, expected_subset",
[(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])],
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices = [False, True, True]
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(
subset, _convert_container(expected_subset, array_type)
)
@pytest.mark.parametrize(
"array_type, expected_output_type",
[
("list", "list"),
("array", "array"),
("sparse", "sparse"),
("dataframe", "series"),
("polars", "polars_series"),
],
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
expected_array = _convert_container([7, 8, 9], expected_output_type)
assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
def test_safe_indexing_1d_scalar(array_type):
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
assert subset == 3
@pytest.mark.parametrize(
"array_type, expected_output_type",
[
("array", "array"),
("sparse", "sparse"),
("dataframe", "series"),
("polars", "polars_series"),
],
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
if isinstance(indices, str) and array_type not in ("dataframe", "polars"):
err_msg = (
"Specifying the columns using strings is only supported for dataframes"
)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=1)
else:
subset = _safe_indexing(array, indices, axis=1)
expected_output = [3, 6, 9]
if expected_output_type == "sparse":
# sparse matrices keep the 2D shape
expected_output = [[3], [6], [9]]
expected_array = _convert_container(expected_output, expected_output_type)
assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
def test_safe_indexing_None_axis_0(array_type):
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
X_subset = _safe_indexing(X, None, axis=0)
assert_allclose_dense_sparse(X_subset, X)
def test_safe_indexing_pandas_no_matching_cols_error():
pd = pytest.importorskip("pandas")
err_msg = "No valid specification of the columns."
X = pd.DataFrame(X_toy)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X, [1.0], axis=1)
@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
with pytest.raises(ValueError, match="'axis' should be either 0"):
_safe_indexing(X_toy, [0, 1], axis=axis)
@pytest.mark.parametrize("X_constructor", ["array", "series", "polars_series"])
def test_safe_indexing_1d_array_error(X_constructor):
# check that we are raising an error if the array-like passed is 1D and
# we try to index on the 2nd dimension
X = list(range(5))
if X_constructor == "array":
X_constructor = np.asarray(X)
elif X_constructor == "series":
pd = pytest.importorskip("pandas")
X_constructor = pd.Series(X)
elif X_constructor == "polars_series":
pl = pytest.importorskip("polars")
X_constructor = pl.Series(values=X)
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or dataframe"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X_constructor, [0, 1], axis=1)
def test_safe_indexing_container_axis_0_unsupported_type():
indices = ["col_1", "col_2"]
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
err_msg = "String indexing is not supported with 'axis=0'"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=0)
def test_safe_indexing_pandas_no_settingwithcopy_warning():
# Using safe_indexing with an array-like indexer gives a copy of the
# DataFrame -> ensure it doesn't raise a warning if modified
pd = pytest.importorskip("pandas")
pd_version = parse_version(pd.__version__)
pd_base_version = parse_version(pd_version.base_version)
if pd_base_version >= parse_version("3"):
raise SkipTest("SettingWithCopyWarning has been removed in pandas 3.0.0.dev")
X = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
subset = _safe_indexing(X, [0, 1], axis=0)
if hasattr(pd.errors, "SettingWithCopyWarning"):
SettingWithCopyWarning = pd.errors.SettingWithCopyWarning
else:
# backward compatibility for pandas < 1.5
SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning
with warnings.catch_warnings():
warnings.simplefilter("error", SettingWithCopyWarning)
subset.iloc[0, 0] = 10
# The original dataframe is unaffected by the assignment on the subset:
assert X.iloc[0, 0] == 1
@pytest.mark.parametrize("indices", [0, [0, 1], slice(0, 2), np.array([0, 1])])
def test_safe_indexing_list_axis_1_unsupported(indices):
"""Check that we raise a ValueError when axis=1 with input as list."""
X = [[1, 2], [4, 5], [7, 8]]
err_msg = "axis=1 is not supported for lists"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X, indices, axis=1)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
def test_safe_assign(array_type):
"""Check that `_safe_assign` works as expected."""
rng = np.random.RandomState(0)
X_array = rng.randn(10, 5)
row_indexer = [1, 2]
values = rng.randn(len(row_indexer), X_array.shape[1])
X = _convert_container(X_array, array_type)
_safe_assign(X, values, row_indexer=row_indexer)
assigned_portion = _safe_indexing(X, row_indexer, axis=0)
assert_allclose_dense_sparse(
assigned_portion, _convert_container(values, array_type)
)
column_indexer = [1, 2]
values = rng.randn(X_array.shape[0], len(column_indexer))
X = _convert_container(X_array, array_type)
_safe_assign(X, values, column_indexer=column_indexer)
assigned_portion = _safe_indexing(X, column_indexer, axis=1)
assert_allclose_dense_sparse(
assigned_portion, _convert_container(values, array_type)
)
row_indexer, column_indexer = None, None
values = rng.randn(*X.shape)
X = _convert_container(X_array, array_type)
_safe_assign(X, values, column_indexer=column_indexer)
assert_allclose_dense_sparse(X, _convert_container(values, array_type))
@pytest.mark.parametrize(
"key, err_msg",
[
(10, r"all features must be in \[0, 2\]"),
("whatever", "A given column is not a column of the dataframe"),
(object(), "No valid specification of the columns"),
],
)
def test_get_column_indices_error(key, err_msg):
pd = pytest.importorskip("pandas")
X_df = pd.DataFrame(X_toy, columns=["col_0", "col_1", "col_2"])
with pytest.raises(ValueError, match=err_msg):
_get_column_indices(X_df, key)
@pytest.mark.parametrize(
"key", [["col1"], ["col2"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
pd = pytest.importorskip("pandas")
toy = np.zeros((1, 5), dtype=int)
columns = ["col1", "col1", "col2", "col3", "col2"]
X = pd.DataFrame(toy, columns=columns)
err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
with pytest.raises(ValueError) as exc_info:
_get_column_indices(X, key)
assert str(exc_info.value) == err_msg
def test_get_column_indices_interchange():
"""Check _get_column_indices for edge cases with the interchange"""
pd = pytest.importorskip("pandas", minversion="1.5")
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
# Hide the fact that this is a pandas dataframe to trigger the dataframe protocol
# code path.
class MockDataFrame:
def __init__(self, df):
self._df = df
def __getattr__(self, name):
return getattr(self._df, name)
df_mocked = MockDataFrame(df)
key_results = [
(slice(1, None), [1, 2]),
(slice(None, 2), [0, 1]),
(slice(1, 2), [1]),
(["b", "c"], [1, 2]),
(slice("a", "b"), [0, 1]),
(slice("a", None), [0, 1, 2]),
(slice(None, "a"), [0]),
(["c", "a"], [2, 0]),
([], []),
]
for key, result in key_results:
assert _get_column_indices(df_mocked, key) == result
msg = "A given column is not a column of the dataframe"
with pytest.raises(ValueError, match=msg):
_get_column_indices(df_mocked, ["not_a_column"])
msg = "key.step must be 1 or None"
with pytest.raises(NotImplementedError, match=msg):
_get_column_indices(df_mocked, slice("a", None, 2))
def test_resample():
# Border case not worth mentioning in doctests
assert resample() is None
# Check that invalid arguments yield ValueError
with pytest.raises(ValueError):
resample([0], [0, 1])
with pytest.raises(ValueError):
resample([0, 1], [0, 1], replace=False, n_samples=3)
# Issue:6581, n_samples can be more when replace is True (default).
assert len(resample([1, 2], n_samples=5)) == 5
def test_resample_stratified():
# Make sure resample can stratify
rng = np.random.RandomState(0)
n_samples = 100
p = 0.9
X = rng.normal(size=(n_samples, 1))
y = rng.binomial(1, p, size=n_samples)
_, y_not_stratified = resample(X, y, n_samples=10, random_state=0, stratify=None)
assert np.all(y_not_stratified == 1)
_, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
assert not np.all(y_stratified == 1)
assert np.sum(y_stratified) == 9  # nine 1s and one 0
def test_resample_stratified_replace():
# Make sure stratified resampling supports the replace parameter
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=n_samples)
X_replace, _ = resample(
X, y, replace=True, n_samples=50, random_state=rng, stratify=y
)
X_no_replace, _ = resample(
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
)
assert np.unique(X_replace).shape[0] < 50
assert np.unique(X_no_replace).shape[0] == 50
# make sure n_samples can be greater than X.shape[0] if we sample with
# replacement
X_replace, _ = resample(
X, y, replace=True, n_samples=1000, random_state=rng, stratify=y
)
assert X_replace.shape[0] == 1000
assert np.unique(X_replace).shape[0] == 100
def test_resample_stratify_2dy():
# Make sure y can be 2d when stratifying
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=(n_samples, 2))
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
assert y.ndim == 2
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_resample_stratify_sparse_error(csr_container):
# stratify must be an ndarray, not sparse
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 2))
y = rng.randint(0, 2, size=n_samples)
stratify = csr_container(y.reshape(-1, 1))
with pytest.raises(TypeError, match="Sparse data was passed"):
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)
def test_shuffle_on_ndim_equals_three():
def to_tuple(A): # to make the inner arrays hashable
return tuple(tuple(tuple(C) for C in B) for B in A)
A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
S = set(to_tuple(A))
shuffle(A) # shouldn't raise a ValueError for dim = 3
assert set(to_tuple(A)) == S
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_shuffle_dont_convert_to_array(csc_container):
# Check that shuffle does not try to convert to numpy arrays with float
# dtype and lets any indexable data structure pass through.
a = ["a", "b", "c"]
b = np.array(["a", "b", "c"], dtype=object)
c = [1, 2, 3]
d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
e = csc_container(np.arange(6).reshape(3, 2))
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
assert a_s == ["c", "b", "a"]
assert type(a_s) == list # noqa: E721
assert_array_equal(b_s, ["c", "b", "a"])
assert b_s.dtype == object
assert c_s == [3, 2, 1]
assert type(c_s) == list # noqa: E721
assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame # noqa: E721
assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))
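# A minimal combined sketch of the public helpers exercised in this module,
# consistent with the behaviour asserted above: `_safe_indexing` selects rows or
# columns uniformly across containers, `shuffle` applies the same permutation to
# every container it is given, and `resample` draws a (bootstrap) sample.
def _indexing_helpers_sketch():
    X = np.arange(12).reshape(4, 3)
    y = np.array([0, 1, 0, 1])
    assert_array_equal(_safe_indexing(X, [0, 2], axis=0), X[[0, 2]])  # rows
    assert_array_equal(_safe_indexing(X, [1], axis=1), X[:, [1]])     # columns
    X_s, y_s = shuffle(X, y, random_state=0)                          # same permutation
    assert X_s.shape == X.shape and y_s.shape == y.shape
    assert len(resample(y, n_samples=8, random_state=0)) == 8         # with replacement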

Some files were not shown because too many files have changed in this diff