feat: initial commit - Phase 1 & 2 core features

2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,100 @@
+"""
+The :mod:`sklearn.linear_model` module implements a variety of linear models.
+"""
+
+# See https://scikit-learn.org/stable/modules/sgd.html and
+# https://scikit-learn.org/stable/modules/linear_model.html for
+# complete documentation.
+
+from ._base import LinearRegression
+from ._bayes import ARDRegression, BayesianRidge
+from ._coordinate_descent import (
+    ElasticNet,
+    ElasticNetCV,
+    Lasso,
+    LassoCV,
+    MultiTaskElasticNet,
+    MultiTaskElasticNetCV,
+    MultiTaskLasso,
+    MultiTaskLassoCV,
+    enet_path,
+    lasso_path,
+)
+from ._glm import GammaRegressor, PoissonRegressor, TweedieRegressor
+from ._huber import HuberRegressor
+from ._least_angle import (
+    Lars,
+    LarsCV,
+    LassoLars,
+    LassoLarsCV,
+    LassoLarsIC,
+    lars_path,
+    lars_path_gram,
+)
+from ._logistic import LogisticRegression, LogisticRegressionCV
+from ._omp import (
+    OrthogonalMatchingPursuit,
+    OrthogonalMatchingPursuitCV,
+    orthogonal_mp,
+    orthogonal_mp_gram,
+)
+from ._passive_aggressive import PassiveAggressiveClassifier, PassiveAggressiveRegressor
+from ._perceptron import Perceptron
+from ._quantile import QuantileRegressor
+from ._ransac import RANSACRegressor
+from ._ridge import Ridge, RidgeClassifier, RidgeClassifierCV, RidgeCV, ridge_regression
+from ._sgd_fast import Hinge, Huber, Log, ModifiedHuber, SquaredLoss
+from ._stochastic_gradient import SGDClassifier, SGDOneClassSVM, SGDRegressor
+from ._theil_sen import TheilSenRegressor
+
+# Public API of the package: every name imported above is re-exported here.
+__all__ = [
+    "ARDRegression",
+    "BayesianRidge",
+    "ElasticNet",
+    "ElasticNetCV",
+    "Hinge",
+    "Huber",
+    "HuberRegressor",
+    "Lars",
+    "LarsCV",
+    "Lasso",
+    "LassoCV",
+    "LassoLars",
+    "LassoLarsCV",
+    "LassoLarsIC",
+    "LinearRegression",
+    "Log",
+    "LogisticRegression",
+    "LogisticRegressionCV",
+    "ModifiedHuber",
+    "MultiTaskElasticNet",
+    "MultiTaskElasticNetCV",
+    "MultiTaskLasso",
+    "MultiTaskLassoCV",
+    "OrthogonalMatchingPursuit",
+    "OrthogonalMatchingPursuitCV",
+    "PassiveAggressiveClassifier",
+    "PassiveAggressiveRegressor",
+    "Perceptron",
+    "QuantileRegressor",
+    "Ridge",
+    "RidgeCV",
+    "RidgeClassifier",
+    "RidgeClassifierCV",
+    "SGDClassifier",
+    "SGDRegressor",
+    "SGDOneClassSVM",
+    "SquaredLoss",
+    "TheilSenRegressor",
+    "enet_path",
+    "lars_path",
+    "lars_path_gram",
+    "lasso_path",
+    "orthogonal_mp",
+    "orthogonal_mp_gram",
+    "ridge_regression",
+    "RANSACRegressor",
+    "PoissonRegressor",
+    "GammaRegressor",
+    "TweedieRegressor",
+]
@@ -0,0 +1,845 @@
+"""
+Generalized Linear Models.
+"""
+
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
+#         Olivier Grisel <olivier.grisel@ensta.org>
+#         Vincent Michel <vincent.michel@inria.fr>
+#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#         Mathieu Blondel <mathieu@mblondel.org>
+#         Lars Buitinck
+#         Maryan Morel <maryan.morel@polytechnique.edu>
+#         Giorgio Patrini <giorgio.patrini@anu.edu.au>
+#         Maria Telenczuk <https://github.com/maikia>
+# License: BSD 3 clause
+
+import numbers
+import warnings
+from abc import ABCMeta, abstractmethod
+from numbers import Integral
+
+import numpy as np
+import scipy.sparse as sp
+from scipy import linalg, optimize, sparse
+from scipy.sparse.linalg import lsqr
+from scipy.special import expit
+
+from ..base import (
+    BaseEstimator,
+    ClassifierMixin,
+    MultiOutputMixin,
+    RegressorMixin,
+    _fit_context,
+)
+from ..utils import check_array, check_random_state
+from ..utils._array_api import (
+    _asarray_with_order,
+    _average,
+    get_namespace,
+    get_namespace_and_device,
+    indexing_dtype,
+    supported_float_dtypes,
+)
+from ..utils._seq_dataset import (
+    ArrayDataset32,
+    ArrayDataset64,
+    CSRDataset32,
+    CSRDataset64,
+)
+from ..utils.extmath import safe_sparse_dot
+from ..utils.parallel import Parallel, delayed
+from ..utils.sparsefuncs import mean_variance_axis
+from ..utils.validation import _check_sample_weight, check_is_fitted
+
+# TODO: bayesian_ridge_regression and bayesian_regression_ard
+# should be squashed into their respective objects.
+
+# Decay factor by which intercept updates are scaled for sparse data, to
+# avoid intercept oscillation. Returned by ``make_dataset`` as the
+# ``intercept_decay`` for sparse inputs.
+SPARSE_INTERCEPT_DECAY = 0.01
+
+
+def make_dataset(X, y, sample_weight, random_state=None):
+    """Create ``Dataset`` abstraction for sparse and dense inputs.
+
+    This also returns the ``intercept_decay`` which is different
+    for sparse datasets.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Training data
+
+    y : array-like, shape (n_samples, )
+        Target values.
+
+    sample_weight : numpy array of shape (n_samples,)
+        The weight of each sample
+
+    random_state : int, RandomState instance or None (default)
+        Determines random number generation for dataset random sampling. It is not
+        used for dataset shuffling.
+        Pass an int for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Returns
+    -------
+    dataset
+        The ``Dataset`` abstraction
+    intercept_decay
+        The intercept decay
+    """
+
+    rng = check_random_state(random_state)
+    # seed should never be 0 in SequentialDataset64
+    seed = rng.randint(1, np.iinfo(np.int32).max)
+
+    if X.dtype == np.float32:
+        CSRData = CSRDataset32
+        ArrayData = ArrayDataset32
+    else:
+        CSRData = CSRDataset64
+        ArrayData = ArrayDataset64
+
+    if sp.issparse(X):
+        dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, seed=seed)
+        intercept_decay = SPARSE_INTERCEPT_DECAY
+    else:
+        X = np.ascontiguousarray(X)
+        dataset = ArrayData(X, y, sample_weight, seed=seed)
+        intercept_decay = 1.0
+
+    return dataset, intercept_decay
+
+
+def _preprocess_data(
+    X,
+    y,
+    *,
+    fit_intercept,
+    copy=True,
+    copy_y=True,
+    sample_weight=None,
+    check_input=True,
+):
+    """Common data preprocessing for fitting linear models.
+
+    This helper is in charge of the following steps:
+
+    - Ensure that `sample_weight` is an array or `None`.
+    - If `check_input=True`, perform standard input validation of `X`, `y`.
+    - Perform copies if requested to avoid side-effects in case of inplace
+      modifications of the input.
+
+    Then, if `fit_intercept=True` this preprocessing centers both `X` and `y` as
+    follows:
+        - if `X` is dense, center the data and
+        store the mean vector in `X_offset`.
+        - if `X` is sparse, store the mean in `X_offset`
+        without centering `X`. The centering is expected to be handled by the
+        linear solver where appropriate.
+        - in either case, always center `y` and store the mean in `y_offset`.
+        - both `X_offset` and `y_offset` are always weighted by `sample_weight`
+          if not set to `None`.
+
+    If `fit_intercept=False`, no centering is performed and `X_offset`, `y_offset`
+    are set to zero.
+
+    Returns
+    -------
+    X_out : {ndarray, sparse matrix} of shape (n_samples, n_features)
+        If copy=True a copy of the input X is triggered, otherwise operations are
+        inplace.
+        If input X is dense, then X_out is centered.
+    y_out : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)
+        Centered version of y. Possibly performed inplace on input y depending
+        on the copy_y parameter.
+    X_offset : ndarray of shape (n_features,)
+        The mean per column of input X.
+    y_offset : float or ndarray of shape (n_features,)
+    X_scale : ndarray of shape (n_features,)
+        Always an array of ones. TODO: refactor the code base to make it
+        possible to remove this unused variable.
+    """
+    xp, _, device_ = get_namespace_and_device(X, y, sample_weight)
+    n_samples, n_features = X.shape
+    X_is_sparse = sp.issparse(X)
+
+    if isinstance(sample_weight, numbers.Number):
+        sample_weight = None
+    if sample_weight is not None:
+        sample_weight = xp.asarray(sample_weight)
+
+    if check_input:
+        X = check_array(
+            X, copy=copy, accept_sparse=["csr", "csc"], dtype=supported_float_dtypes(xp)
+        )
+        y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False)
+    else:
+        y = xp.astype(y, X.dtype, copy=copy_y)
+        if copy:
+            if X_is_sparse:
+                X = X.copy()
+            else:
+                X = _asarray_with_order(X, order="K", copy=True, xp=xp)
+
+    dtype_ = X.dtype
+
+    if fit_intercept:
+        if X_is_sparse:
+            X_offset, X_var = mean_variance_axis(X, axis=0, weights=sample_weight)
+        else:
+            X_offset = _average(X, axis=0, weights=sample_weight, xp=xp)
+
+            X_offset = xp.astype(X_offset, X.dtype, copy=False)
+            X -= X_offset
+
+        y_offset = _average(y, axis=0, weights=sample_weight, xp=xp)
+        y -= y_offset
+    else:
+        X_offset = xp.zeros(n_features, dtype=X.dtype, device=device_)
+        if y.ndim == 1:
+            y_offset = xp.asarray(0.0, dtype=dtype_, device=device_)
+        else:
+            y_offset = xp.zeros(y.shape[1], dtype=dtype_, device=device_)
+
+    # XXX: X_scale is no longer needed. It is an historic artifact from the
+    # time where linear model exposed the normalize parameter.
+    X_scale = xp.ones(n_features, dtype=X.dtype, device=device_)
+    return X, y, X_offset, y_offset, X_scale
+
+
+# TODO: _rescale_data should be factored into _preprocess_data.
+# Currently, the fact that sag implements its own way to deal with
+# sample_weight makes the refactoring tricky.
+
+
+def _rescale_data(X, y, sample_weight, inplace=False):
+    """Rescale data sample-wise by square root of sample_weight.
+
+    For many linear models, this enables easy support for sample_weight because
+
+        (y - X w)' S (y - X w)
+
+    with S = diag(sample_weight) becomes
+
+        ||y_rescaled - X_rescaled w||_2^2
+
+    when setting
+
+        y_rescaled = sqrt(S) y
+        X_rescaled = sqrt(S) X
+
+    Returns
+    -------
+    X_rescaled : {array-like, sparse matrix}
+
+    y_rescaled : {array-like, sparse matrix}
+    """
+    # Assume that _validate_data and _check_sample_weight have been called by
+    # the caller.
+    xp, _ = get_namespace(X, y, sample_weight)
+    n_samples = X.shape[0]
+    sample_weight_sqrt = xp.sqrt(sample_weight)
+
+    if sp.issparse(X) or sp.issparse(y):
+        sw_matrix = sparse.dia_matrix(
+            (sample_weight_sqrt, 0), shape=(n_samples, n_samples)
+        )
+
+    if sp.issparse(X):
+        X = safe_sparse_dot(sw_matrix, X)
+    else:
+        if inplace:
+            X *= sample_weight_sqrt[:, None]
+        else:
+            X = X * sample_weight_sqrt[:, None]
+
+    if sp.issparse(y):
+        y = safe_sparse_dot(sw_matrix, y)
+    else:
+        if inplace:
+            if y.ndim == 1:
+                y *= sample_weight_sqrt
+            else:
+                y *= sample_weight_sqrt[:, None]
+        else:
+            if y.ndim == 1:
+                y = y * sample_weight_sqrt
+            else:
+                y = y * sample_weight_sqrt[:, None]
+    return X, y, sample_weight_sqrt
+
+
+class LinearModel(BaseEstimator, metaclass=ABCMeta):
+    """Base class for Linear Models"""
+
+    @abstractmethod
+    def fit(self, X, y):
+        """Fit model."""
+
+    def _decision_function(self, X):
+        check_is_fitted(self)
+
+        X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
+        coef_ = self.coef_
+        if coef_.ndim == 1:
+            return X @ coef_ + self.intercept_
+        else:
+            return X @ coef_.T + self.intercept_
+
+    def predict(self, X):
+        """
+        Predict using the linear model.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        C : array, shape (n_samples,)
+            Returns predicted values.
+        """
+        return self._decision_function(X)
+
+    def _set_intercept(self, X_offset, y_offset, X_scale):
+        """Set the intercept_"""
+
+        xp, _ = get_namespace(X_offset, y_offset, X_scale)
+
+        if self.fit_intercept:
+            # We always want coef_.dtype=X.dtype. For instance, X.dtype can differ from
+            # coef_.dtype if warm_start=True.
+            coef_ = xp.astype(self.coef_, X_scale.dtype, copy=False)
+            coef_ = self.coef_ = xp.divide(coef_, X_scale)
+
+            if coef_.ndim == 1:
+                intercept_ = y_offset - X_offset @ coef_
+            else:
+                intercept_ = y_offset - X_offset @ coef_.T
+
+            self.intercept_ = intercept_
+
+        else:
+            self.intercept_ = 0.0
+
+    def _more_tags(self):
+        return {"requires_y": True}
+
+
+# XXX Should this derive from LinearModel? It should be a mixin, not an ABC.
+# Maybe the n_features checking can be moved to LinearModel.
+class LinearClassifierMixin(ClassifierMixin):
+    """Mixin for linear classifiers.
+
+    Handles prediction for sparse and dense X.
+    """
+
+    def decision_function(self, X):
+        """
+        Predict confidence scores for samples.
+
+        The confidence score for a sample is proportional to the signed
+        distance of that sample to the hyperplane.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to get the confidence scores.
+
+        Returns
+        -------
+        scores : ndarray of shape (n_samples,) or (n_samples, n_classes)
+            Confidence scores per `(n_samples, n_classes)` combination. In the
+            binary case, confidence score for `self.classes_[1]` where >0 means
+            this class would be predicted.
+        """
+        check_is_fitted(self)
+        xp, _ = get_namespace(X)
+
+        X = self._validate_data(X, accept_sparse="csr", reset=False)
+        scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
+        return xp.reshape(scores, (-1,)) if scores.shape[1] == 1 else scores
+
+    def predict(self, X):
+        """
+        Predict class labels for samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to get the predictions.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            Vector containing the class labels for each sample.
+        """
+        xp, _ = get_namespace(X)
+        scores = self.decision_function(X)
+        if len(scores.shape) == 1:
+            indices = xp.astype(scores > 0, indexing_dtype(xp))
+        else:
+            indices = xp.argmax(scores, axis=1)
+
+        return xp.take(self.classes_, indices, axis=0)
+
+    def _predict_proba_lr(self, X):
+        """Probability estimation for OvR logistic regression.
+
+        Positive class probabilities are computed as
+        1. / (1. + np.exp(-self.decision_function(X)));
+        multiclass is handled by normalizing that over all classes.
+        """
+        prob = self.decision_function(X)
+        expit(prob, out=prob)
+        if prob.ndim == 1:
+            return np.vstack([1 - prob, prob]).T
+        else:
+            # OvR normalization, like LibLinear's predict_probability
+            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
+            return prob
+
+
+class SparseCoefMixin:
+    """Mixin for converting coef_ to and from CSR format.
+
+    L1-regularizing estimators should inherit this.
+    """
+
+    def densify(self):
+        """
+        Convert coefficient matrix to dense array format.
+
+        Converts the ``coef_`` member (back) to a numpy.ndarray. This is the
+        default format of ``coef_`` and is required for fitting, so calling
+        this method is only required on models that have previously been
+        sparsified; otherwise, it is a no-op.
+
+        Returns
+        -------
+        self
+            Fitted estimator.
+        """
+        msg = "Estimator, %(name)s, must be fitted before densifying."
+        check_is_fitted(self, msg=msg)
+        if sp.issparse(self.coef_):
+            self.coef_ = self.coef_.toarray()
+        return self
+
+    def sparsify(self):
+        """
+        Convert coefficient matrix to sparse format.
+
+        Converts the ``coef_`` member to a scipy.sparse matrix, which for
+        L1-regularized models can be much more memory- and storage-efficient
+        than the usual numpy.ndarray representation.
+
+        The ``intercept_`` member is not converted.
+
+        Returns
+        -------
+        self
+            Fitted estimator.
+
+        Notes
+        -----
+        For non-sparse models, i.e. when there are not many zeros in ``coef_``,
+        this may actually *increase* memory usage, so use this method with
+        care. A rule of thumb is that the number of zero elements, which can
+        be computed with ``(coef_ == 0).sum()``, must be more than 50% for this
+        to provide significant benefits.
+
+        After calling this method, further fitting with the partial_fit
+        method (if any) will not work until you call densify.
+        """
+        msg = "Estimator, %(name)s, must be fitted before sparsifying."
+        check_is_fitted(self, msg=msg)
+        self.coef_ = sp.csr_matrix(self.coef_)
+        return self
+
+
+class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
+    """
+    Ordinary least squares Linear Regression.
+
+    LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
+    to minimize the residual sum of squares between the observed targets in
+    the dataset, and the targets predicted by the linear approximation.
+
+    Parameters
+    ----------
+    fit_intercept : bool, default=True
+        Whether to calculate the intercept for this model. If set
+        to False, no intercept will be used in calculations
+        (i.e. data is expected to be centered).
+
+    copy_X : bool, default=True
+        If True, X will be copied; else, it may be overwritten.
+
+    n_jobs : int, default=None
+        The number of jobs to use for the computation. This will only provide
+        speedup in case of sufficiently large problems, that is if firstly
+        `n_targets > 1` and secondly `X` is sparse or if `positive` is set
+        to `True`. ``None`` means 1 unless in a
+        :obj:`joblib.parallel_backend` context. ``-1`` means using all
+        processors. See :term:`Glossary <n_jobs>` for more details.
+
+    positive : bool, default=False
+        When set to ``True``, forces the coefficients to be positive. This
+        option is only supported for dense arrays.
+
+        .. versionadded:: 0.24
+
+    Attributes
+    ----------
+    coef_ : array of shape (n_features, ) or (n_targets, n_features)
+        Estimated coefficients for the linear regression problem.
+        If multiple targets are passed during the fit (y 2D), this
+        is a 2D array of shape (n_targets, n_features), while if only
+        one target is passed, this is a 1D array of length n_features.
+
+    rank_ : int
+        Rank of matrix `X`. Only available when `X` is dense.
+
+    singular_ : array of shape (min(X, y),)
+        Singular values of `X`. Only available when `X` is dense.
+
+    intercept_ : float or array of shape (n_targets,)
+        Independent term in the linear model. Set to 0.0 if
+        `fit_intercept = False`.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    Ridge : Ridge regression addresses some of the
+        problems of Ordinary Least Squares by imposing a penalty on the
+        size of the coefficients with l2 regularization.
+    Lasso : The Lasso is a linear model that estimates
+        sparse coefficients with l1 regularization.
+    ElasticNet : Elastic-Net is a linear regression
+        model trained with both l1 and l2 -norm regularization of the
+        coefficients.
+
+    Notes
+    -----
+    From the implementation point of view, this is just plain Ordinary
+    Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares
+    (scipy.optimize.nnls) wrapped as a predictor object.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.linear_model import LinearRegression
+    >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
+    >>> # y = 1 * x_0 + 2 * x_1 + 3
+    >>> y = np.dot(X, np.array([1, 2])) + 3
+    >>> reg = LinearRegression().fit(X, y)
+    >>> reg.score(X, y)
+    1.0
+    >>> reg.coef_
+    array([1., 2.])
+    >>> reg.intercept_
+    3.0...
+    >>> reg.predict(np.array([[3, 5]]))
+    array([16.])
+    """
+
+    # Accepted types/values for each constructor parameter.
+    _parameter_constraints: dict = {
+        "fit_intercept": ["boolean"],
+        "copy_X": ["boolean"],
+        "n_jobs": [None, Integral],
+        "positive": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        *,
+        fit_intercept=True,
+        copy_X=True,
+        n_jobs=None,
+        positive=False,
+    ):
+        # Parameters are stored as given, without modification.
+        self.fit_intercept = fit_intercept
+        self.copy_X = copy_X
+        self.n_jobs = n_jobs
+        self.positive = positive
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, sample_weight=None):
+        """
+        Fit linear model.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_targets)
+            Target values. Will be cast to X's dtype if necessary.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Individual weights for each sample.
+
+            .. versionadded:: 0.17
+               parameter *sample_weight* support to LinearRegression.
+
+        Returns
+        -------
+        self : object
+            Fitted Estimator.
+        """
+        n_jobs_ = self.n_jobs
+
+        # Sparse input is not accepted when positive=True (the NNLS path
+        # below works on dense arrays only).
+        accept_sparse = False if self.positive else ["csr", "csc", "coo"]
+
+        X, y = self._validate_data(
+            X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
+        )
+
+        has_sw = sample_weight is not None
+        if has_sw:
+            sample_weight = _check_sample_weight(
+                sample_weight, X, dtype=X.dtype, only_non_negative=True
+            )
+
+        # Note that neither _rescale_data nor the rest of the fit method of
+        # LinearRegression can benefit from in-place operations when X is a
+        # sparse matrix. Therefore, let's not copy X when it is sparse.
+        copy_X_in_preprocess_data = self.copy_X and not sp.issparse(X)
+
+        X, y, X_offset, y_offset, X_scale = _preprocess_data(
+            X,
+            y,
+            fit_intercept=self.fit_intercept,
+            copy=copy_X_in_preprocess_data,
+            sample_weight=sample_weight,
+        )
+
+        if has_sw:
+            # Sample weight can be implemented via a simple rescaling. Note
+            # that we safely do inplace rescaling when _preprocess_data has
+            # already made a copy if requested.
+            X, y, sample_weight_sqrt = _rescale_data(
+                X, y, sample_weight, inplace=copy_X_in_preprocess_data
+            )
+
+        # Three solver paths follow: non-negative least squares when
+        # positive=True, LSQR on an implicitly centered operator when X is
+        # sparse, and dense least squares otherwise.
+        if self.positive:
+            if y.ndim < 2:
+                self.coef_ = optimize.nnls(X, y)[0]
+            else:
+                # scipy.optimize.nnls cannot handle y with shape (M, K)
+                outs = Parallel(n_jobs=n_jobs_)(
+                    delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1])
+                )
+                self.coef_ = np.vstack([out[0] for out in outs])
+        elif sp.issparse(X):
+            # _preprocess_data does not center a sparse X; the offset is
+            # subtracted on the fly inside matvec/rmatvec below, so the
+            # solver sees a centered operator.
+            X_offset_scale = X_offset / X_scale
+
+            if has_sw:
+                # X was rescaled by sqrt(sample_weight), so the offset term
+                # is weighted by sample_weight_sqrt as well.
+
+                def matvec(b):
+                    return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale)
+
+                def rmatvec(b):
+                    return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt)
+
+            else:
+
+                def matvec(b):
+                    return X.dot(b) - b.dot(X_offset_scale)
+
+                def rmatvec(b):
+                    return X.T.dot(b) - X_offset_scale * b.sum()
+
+            X_centered = sparse.linalg.LinearOperator(
+                shape=X.shape, matvec=matvec, rmatvec=rmatvec
+            )
+
+            if y.ndim < 2:
+                self.coef_ = lsqr(X_centered, y)[0]
+            else:
+                # sparse_lstsq cannot handle y with shape (M, K)
+                outs = Parallel(n_jobs=n_jobs_)(
+                    delayed(lsqr)(X_centered, y[:, j].ravel())
+                    for j in range(y.shape[1])
+                )
+                self.coef_ = np.vstack([out[0] for out in outs])
+        else:
+            # Only this dense path sets rank_ and singular_.
+            self.coef_, _, self.rank_, self.singular_ = linalg.lstsq(X, y)
+            self.coef_ = self.coef_.T
+
+        # A single target always yields a 1D coef_.
+        if y.ndim == 1:
+            self.coef_ = np.ravel(self.coef_)
+        self._set_intercept(X_offset, y_offset, X_scale)
+        return self
+
+
+def _check_precomputed_gram_matrix(
+    X, precompute, X_offset, X_scale, rtol=None, atol=1e-5
+):
+    """Computes a single element of the gram matrix and compares it to
+    the corresponding element of the user supplied gram matrix.
+
+    If the values do not match a ValueError will be thrown.
+
+    Parameters
+    ----------
+    X : ndarray of shape (n_samples, n_features)
+        Data array.
+
+    precompute : array-like of shape (n_features, n_features)
+        User-supplied gram matrix.
+
+    X_offset : ndarray of shape (n_features,)
+        Array of feature means used to center design matrix.
+
+    X_scale : ndarray of shape (n_features,)
+        Array of feature scale factors used to normalize design matrix.
+
+    rtol : float, default=None
+        Relative tolerance; see numpy.allclose
+        If None, it is set to 1e-4 for arrays of dtype numpy.float32 and 1e-7
+        otherwise.
+
+    atol : float, default=1e-5
+        absolute tolerance; see :func`numpy.allclose`. Note that the default
+        here is more tolerant than the default for
+        :func:`numpy.testing.assert_allclose`, where `atol=0`.
+
+    Raises
+    ------
+    ValueError
+        Raised when the provided Gram matrix is not consistent.
+    """
+
+    n_features = X.shape[1]
+    f1 = n_features // 2
+    f2 = min(f1 + 1, n_features - 1)
+
+    v1 = (X[:, f1] - X_offset[f1]) * X_scale[f1]
+    v2 = (X[:, f2] - X_offset[f2]) * X_scale[f2]
+
+    expected = np.dot(v1, v2)
+    actual = precompute[f1, f2]
+
+    dtypes = [precompute.dtype, expected.dtype]
+    if rtol is None:
+        rtols = [1e-4 if dtype == np.float32 else 1e-7 for dtype in dtypes]
+        rtol = max(rtols)
+
+    if not np.isclose(expected, actual, rtol=rtol, atol=atol):
+        raise ValueError(
+            "Gram matrix passed in via 'precompute' parameter "
+            "did not pass validation when a single element was "
+            "checked - please check that it was computed "
+            f"properly. For element ({f1},{f2}) we computed "
+            f"{expected} but the user-supplied value was "
+            f"{actual}."
+        )
+
+
+def _pre_fit(
+    X,
+    y,
+    Xy,
+    precompute,
+    fit_intercept,
+    copy,
+    check_input=True,
+    sample_weight=None,
+):
+    """Function used at beginning of fit in linear models with L1 or L0 penalty.
+
+    This function applies _preprocess_data and additionally computes the gram matrix
+    `precompute` as needed as well as `Xy`.
+
+    For sparse `X`, `precompute` is forced to False and `X` is never copied.
+    For dense `X`, the data is additionally rescaled with `_rescale_data`
+    when `sample_weight` is given.
+
+    Returns
+    -------
+    X, y, X_offset, y_offset, X_scale
+        Outputs of `_preprocess_data` (`X` and `y` being additionally
+        rescaled in the dense, weighted case).
+    precompute
+        The Gram matrix when one is used (either the user-supplied array or
+        `X.T @ X` computed here), otherwise the non-array value of
+        `precompute`.
+    Xy
+        `X.T @ y`, either as passed in or computed here, when a Gram matrix
+        is used; None otherwise.
+    """
+    n_samples, n_features = X.shape
+
+    if sparse.issparse(X):
+        # copy is not needed here as X is not modified inplace when X is sparse
+        precompute = False
+        X, y, X_offset, y_offset, X_scale = _preprocess_data(
+            X,
+            y,
+            fit_intercept=fit_intercept,
+            copy=False,
+            check_input=check_input,
+            sample_weight=sample_weight,
+        )
+    else:
+        # copy was done in fit if necessary
+        X, y, X_offset, y_offset, X_scale = _preprocess_data(
+            X,
+            y,
+            fit_intercept=fit_intercept,
+            copy=copy,
+            check_input=check_input,
+            sample_weight=sample_weight,
+        )
+        # Rescale only in dense case. Sparse cd solver directly deals with
+        # sample_weight.
+        if sample_weight is not None:
+            # This triggers copies anyway.
+            X, y, _ = _rescale_data(X, y, sample_weight=sample_weight)
+
+    # A user-supplied Gram matrix (anything exposing __array__) is either
+    # discarded, because centering X invalidated it, or spot-checked.
+    if hasattr(precompute, "__array__"):
+        if fit_intercept and not np.allclose(X_offset, np.zeros(n_features)):
+            warnings.warn(
+                (
+                    "Gram matrix was provided but X was centered to fit "
+                    "intercept: recomputing Gram matrix."
+                ),
+                UserWarning,
+            )
+            # TODO: instead of warning and recomputing, we could just center
+            # the user provided Gram matrix a-posteriori (after making a copy
+            # when `copy=True`).
+            # recompute Gram
+            precompute = "auto"
+            Xy = None
+        elif check_input:
+            # If we're going to use the user's precomputed gram matrix, we
+            # do a quick check to make sure its not totally bogus.
+            _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale)
+
+    # precompute if n_samples > n_features
+    if isinstance(precompute, str) and precompute == "auto":
+        precompute = n_samples > n_features
+
+    # `is True` (not truthiness) so that a Gram array is not recomputed here.
+    if precompute is True:
+        # make sure that the 'precompute' array is contiguous.
+        precompute = np.empty(shape=(n_features, n_features), dtype=X.dtype, order="C")
+        np.dot(X.T, X, out=precompute)
+
+    if not hasattr(precompute, "__array__"):
+        Xy = None  # cannot use Xy if precompute is not Gram
+
+    # Compute Xy only when a Gram matrix is in use and Xy was not supplied
+    # (or was reset above).
+    if hasattr(precompute, "__array__") and Xy is None:
+        common_dtype = np.result_type(X.dtype, y.dtype)
+        if y.ndim == 1:
+            # Xy is 1d, make sure it is contiguous.
+            Xy = np.empty(shape=n_features, dtype=common_dtype, order="C")
+            np.dot(X.T, y, out=Xy)
+        else:
+            # Make sure that Xy is always F contiguous even if X or y are not
+            # contiguous: the goal is to make it fast to extract the data for a
+            # specific target.
+            n_targets = y.shape[1]
+            Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order="F")
+            # Writing y.T @ X into the transposed view fills Xy with X.T @ y.
+            np.dot(y.T, X, out=Xy.T)
+
+    return X, y, X_offset, y_offset, X_scale, precompute, Xy
@@ -0,0 +1,784 @@
+"""
+Various Bayesian regression models.
+"""
+
+# Authors: V. Michel, F. Pedregosa, A. Gramfort
+# License: BSD 3 clause
+
+from math import log
+from numbers import Integral, Real
+
+import numpy as np
+from scipy import linalg
+from scipy.linalg import pinvh
+
+from ..base import RegressorMixin, _fit_context
+from ..utils import _safe_indexing
+from ..utils._param_validation import Interval
+from ..utils.extmath import fast_logdet
+from ..utils.validation import _check_sample_weight
+from ._base import LinearModel, _preprocess_data, _rescale_data
+
+###############################################################################
+# BayesianRidge regression
+
+
+class BayesianRidge(RegressorMixin, LinearModel):
+    """Bayesian ridge regression.
+
+    Fit a Bayesian ridge model. See the Notes section for details on this
+    implementation and the optimization of the regularization parameters
+    lambda (precision of the weights) and alpha (precision of the noise).
+
+    Read more in the :ref:`User Guide <bayesian_regression>`.
+    For an intuitive visualization of how the sinusoid is approximated by
+    a polynomial using different pairs of initial values, see
+    :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`.
+
+    Parameters
+    ----------
+    max_iter : int, default=300
+        Maximum number of iterations over the complete dataset before
+        stopping independently of any early stopping criterion.
+
+        .. versionchanged:: 1.3
+
+    tol : float, default=1e-3
+        Stop the algorithm if w has converged.
+
+    alpha_1 : float, default=1e-6
+        Hyper-parameter : shape parameter for the Gamma distribution prior
+        over the alpha parameter.
+
+    alpha_2 : float, default=1e-6
+        Hyper-parameter : inverse scale parameter (rate parameter) for the
+        Gamma distribution prior over the alpha parameter.
+
+    lambda_1 : float, default=1e-6
+        Hyper-parameter : shape parameter for the Gamma distribution prior
+        over the lambda parameter.
+
+    lambda_2 : float, default=1e-6
+        Hyper-parameter : inverse scale parameter (rate parameter) for the
+        Gamma distribution prior over the lambda parameter.
+
+    alpha_init : float, default=None
+        Initial value for alpha (precision of the noise).
+        If not set, alpha_init is 1/Var(y).
+
+            .. versionadded:: 0.22
+
+    lambda_init : float, default=None
+        Initial value for lambda (precision of the weights).
+        If not set, lambda_init is 1.
+
+            .. versionadded:: 0.22
+
+    compute_score : bool, default=False
+        If True, compute the log marginal likelihood at each iteration of the
+        optimization.
+
+    fit_intercept : bool, default=True
+        Whether to calculate the intercept for this model.
+        The intercept is not treated as a probabilistic parameter
+        and thus has no associated variance. If set
+        to False, no intercept will be used in calculations
+        (i.e. data is expected to be centered).
+
+    copy_X : bool, default=True
+        If True, X will be copied; else, it may be overwritten.
+
+    verbose : bool, default=False
+        Verbose mode when fitting the model.
+
+    Attributes
+    ----------
+    coef_ : array-like of shape (n_features,)
+        Coefficients of the regression model (mean of distribution)
+
+    intercept_ : float
+        Independent term in decision function. Set to 0.0 if
+        `fit_intercept = False`.
+
+    alpha_ : float
+       Estimated precision of the noise.
+
+    lambda_ : float
+       Estimated precision of the weights.
+
+    sigma_ : array-like of shape (n_features, n_features)
+        Estimated variance-covariance matrix of the weights
+
+    scores_ : array-like of shape (n_iter_+1,)
+        If `compute_score` is True, value of the log marginal likelihood (to be
+        maximized) at each iteration of the optimization. The array starts
+        with the value of the log marginal likelihood obtained for the initial
+        values of alpha and lambda and ends with the value obtained for the
+        estimated alpha and lambda.
+
+    n_iter_ : int
+        The actual number of iterations to reach the stopping criterion.
+
+    X_offset_ : ndarray of shape (n_features,)
+        If `fit_intercept=True`, offset subtracted for centering data to a
+        zero mean. Set to np.zeros(n_features) otherwise.
+
+    X_scale_ : ndarray of shape (n_features,)
+        Set to np.ones(n_features).
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    ARDRegression : Bayesian ARD regression.
+
+    Notes
+    -----
+    There exist several strategies to perform Bayesian ridge regression. This
+    implementation is based on the algorithm described in Appendix A of
+    (Tipping, 2001) where updates of the regularization parameters are done as
+    suggested in (MacKay, 1992). Note that according to A New
+    View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
+    update rules do not guarantee that the marginal likelihood is increasing
+    between two consecutive iterations of the optimization.
+
+    References
+    ----------
+    D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
+    Vol. 4, No. 3, 1992.
+
+    M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
+    Journal of Machine Learning Research, Vol. 1, 2001.
+
+    Examples
+    --------
+    >>> from sklearn import linear_model
+    >>> clf = linear_model.BayesianRidge()
+    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
+    BayesianRidge()
+    >>> clf.predict([[1, 1]])
+    array([1.])
+    """
+
+    _parameter_constraints: dict = {
+        "max_iter": [Interval(Integral, 1, None, closed="left")],
+        "tol": [Interval(Real, 0, None, closed="neither")],
+        "alpha_1": [Interval(Real, 0, None, closed="left")],
+        "alpha_2": [Interval(Real, 0, None, closed="left")],
+        "lambda_1": [Interval(Real, 0, None, closed="left")],
+        "lambda_2": [Interval(Real, 0, None, closed="left")],
+        "alpha_init": [None, Interval(Real, 0, None, closed="left")],
+        "lambda_init": [None, Interval(Real, 0, None, closed="left")],
+        "compute_score": ["boolean"],
+        "fit_intercept": ["boolean"],
+        "copy_X": ["boolean"],
+        "verbose": ["verbose"],
+    }
+
+    def __init__(
+        self,
+        *,
+        max_iter=300,
+        tol=1.0e-3,
+        alpha_1=1.0e-6,
+        alpha_2=1.0e-6,
+        lambda_1=1.0e-6,
+        lambda_2=1.0e-6,
+        alpha_init=None,
+        lambda_init=None,
+        compute_score=False,
+        fit_intercept=True,
+        copy_X=True,
+        verbose=False,
+    ):
+        self.max_iter = max_iter
+        self.tol = tol
+        self.alpha_1 = alpha_1
+        self.alpha_2 = alpha_2
+        self.lambda_1 = lambda_1
+        self.lambda_2 = lambda_2
+        self.alpha_init = alpha_init
+        self.lambda_init = lambda_init
+        self.compute_score = compute_score
+        self.fit_intercept = fit_intercept
+        self.copy_X = copy_X
+        self.verbose = verbose
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, sample_weight=None):
+        """Fit the model.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, n_features)
+            Training data.
+        y : ndarray of shape (n_samples,)
+            Target values. Will be cast to X's dtype if necessary.
+
+        sample_weight : ndarray of shape (n_samples,), default=None
+            Individual weights for each sample.
+
+            .. versionadded:: 0.20
+               parameter *sample_weight* support to BayesianRidge.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)
+        dtype = X.dtype
+
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype)
+
+        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
+            X,
+            y,
+            fit_intercept=self.fit_intercept,
+            copy=self.copy_X,
+            sample_weight=sample_weight,
+        )
+
+        if sample_weight is not None:
+            # Sample weight can be implemented via a simple rescaling.
+            X, y, _ = _rescale_data(X, y, sample_weight)
+
+        self.X_offset_ = X_offset_
+        self.X_scale_ = X_scale_
+        n_samples, n_features = X.shape
+
+        # Initialization of the values of the parameters
+        eps = np.finfo(np.float64).eps
+        # Add `eps` in the denominator to omit division by zero if `np.var(y)`
+        # is zero
+        alpha_ = self.alpha_init
+        lambda_ = self.lambda_init
+        if alpha_ is None:
+            alpha_ = 1.0 / (np.var(y) + eps)
+        if lambda_ is None:
+            lambda_ = 1.0
+
+        # Avoid unintended type promotion to float64 with numpy 2
+        alpha_ = np.asarray(alpha_, dtype=dtype)
+        lambda_ = np.asarray(lambda_, dtype=dtype)
+
+        verbose = self.verbose
+        lambda_1 = self.lambda_1
+        lambda_2 = self.lambda_2
+        alpha_1 = self.alpha_1
+        alpha_2 = self.alpha_2
+
+        self.scores_ = list()
+        coef_old_ = None
+
+        XT_y = np.dot(X.T, y)
+        U, S, Vh = linalg.svd(X, full_matrices=False)
+        eigen_vals_ = S**2
+
+        # Convergence loop of the bayesian ridge regression
+        for iter_ in range(self.max_iter):
+            # update posterior mean coef_ based on alpha_ and lambda_ and
+            # compute corresponding rmse
+            coef_, rmse_ = self._update_coef_(
+                X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
+            )
+            if self.compute_score:
+                # compute the log marginal likelihood
+                s = self._log_marginal_likelihood(
+                    n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
+                )
+                self.scores_.append(s)
+
+            # Update alpha and lambda according to (MacKay, 1992)
+            gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
+            lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
+            alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)
+
+            # Check for convergence
+            if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
+                if verbose:
+                    print("Convergence after ", str(iter_), " iterations")
+                break
+            coef_old_ = np.copy(coef_)
+
+        self.n_iter_ = iter_ + 1
+
+        # return regularization parameters and corresponding posterior mean,
+        # log marginal likelihood and posterior covariance
+        self.alpha_ = alpha_
+        self.lambda_ = lambda_
+        self.coef_, rmse_ = self._update_coef_(
+            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
+        )
+        if self.compute_score:
+            # compute the log marginal likelihood
+            s = self._log_marginal_likelihood(
+                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
+            )
+            self.scores_.append(s)
+            self.scores_ = np.array(self.scores_)
+
+        # posterior covariance is given by 1/alpha_ * scaled_sigma_
+        scaled_sigma_ = np.dot(
+            Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
+        )
+        self.sigma_ = (1.0 / alpha_) * scaled_sigma_
+
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
+
+        return self
+
+    def predict(self, X, return_std=False):
+        """Predict using the linear model.
+
+        In addition to the mean of the predictive distribution, also its
+        standard deviation can be returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Samples.
+
+        return_std : bool, default=False
+            Whether to return the standard deviation of posterior prediction.
+
+        Returns
+        -------
+        y_mean : array-like of shape (n_samples,)
+            Mean of predictive distribution of query points.
+
+        y_std : array-like of shape (n_samples,)
+            Standard deviation of predictive distribution of query points.
+        """
+        y_mean = self._decision_function(X)
+        if not return_std:
+            return y_mean
+        else:
+            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
+            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
+            return y_mean, y_std
+
+    def _update_coef_(
+        self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
+    ):
+        """Update posterior mean and compute corresponding rmse.
+
+        Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
+        scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
+                         + np.dot(X.T, X))^-1
+        """
+
+        if n_samples > n_features:
+            coef_ = np.linalg.multi_dot(
+                [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y]
+            )
+        else:
+            coef_ = np.linalg.multi_dot(
+                [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y]
+            )
+
+        rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
+
+        return coef_, rmse_
+
+    def _log_marginal_likelihood(
+        self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
+    ):
+        """Log marginal likelihood."""
+        alpha_1 = self.alpha_1
+        alpha_2 = self.alpha_2
+        lambda_1 = self.lambda_1
+        lambda_2 = self.lambda_2
+
+        # compute the log of the determinant of the posterior covariance.
+        # posterior covariance is given by
+        # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
+        if n_samples > n_features:
+            logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals))
+        else:
+            logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)
+            logdet_sigma[:n_samples] += alpha_ * eigen_vals
+            logdet_sigma = -np.sum(np.log(logdet_sigma))
+
+        score = lambda_1 * log(lambda_) - lambda_2 * lambda_
+        score += alpha_1 * log(alpha_) - alpha_2 * alpha_
+        score += 0.5 * (
+            n_features * log(lambda_)
+            + n_samples * log(alpha_)
+            - alpha_ * rmse
+            - lambda_ * np.sum(coef**2)
+            + logdet_sigma
+            - n_samples * log(2 * np.pi)
+        )
+
+        return score
+
+
+###############################################################################
+# ARD (Automatic Relevance Determination) regression
+
+
+class ARDRegression(RegressorMixin, LinearModel):
+    """Bayesian ARD regression.
+
+    Fit the weights of a regression model, using an ARD prior. The weights of
+    the regression model are assumed to be in Gaussian distributions.
+    Also estimate the parameters lambda (precisions of the distributions of the
+    weights) and alpha (precision of the distribution of the noise).
+    The estimation is done by an iterative procedure (Evidence Maximization).
+
+    Read more in the :ref:`User Guide <bayesian_regression>`.
+
+    Parameters
+    ----------
+    max_iter : int, default=300
+        Maximum number of iterations.
+
+        .. versionchanged:: 1.3
+
+    tol : float, default=1e-3
+        Stop the algorithm if w has converged.
+
+    alpha_1 : float, default=1e-6
+        Hyper-parameter : shape parameter for the Gamma distribution prior
+        over the alpha parameter.
+
+    alpha_2 : float, default=1e-6
+        Hyper-parameter : inverse scale parameter (rate parameter) for the
+        Gamma distribution prior over the alpha parameter.
+
+    lambda_1 : float, default=1e-6
+        Hyper-parameter : shape parameter for the Gamma distribution prior
+        over the lambda parameter.
+
+    lambda_2 : float, default=1e-6
+        Hyper-parameter : inverse scale parameter (rate parameter) for the
+        Gamma distribution prior over the lambda parameter.
+
+    compute_score : bool, default=False
+        If True, compute the objective function at each step of the model.
+
+    threshold_lambda : float, default=10 000
+        Threshold for removing (pruning) weights with high precision from
+        the computation.
+
+    fit_intercept : bool, default=True
+        Whether to calculate the intercept for this model. If set
+        to false, no intercept will be used in calculations
+        (i.e. data is expected to be centered).
+
+    copy_X : bool, default=True
+        If True, X will be copied; else, it may be overwritten.
+
+    verbose : bool, default=False
+        Verbose mode when fitting the model.
+
+    Attributes
+    ----------
+    coef_ : array-like of shape (n_features,)
+        Coefficients of the regression model (mean of distribution)
+
+    alpha_ : float
+       estimated precision of the noise.
+
+    lambda_ : array-like of shape (n_features,)
+       estimated precisions of the weights.
+
+    sigma_ : array-like of shape (n_features, n_features)
+        estimated variance-covariance matrix of the weights
+
+    scores_ : float
+        if computed, value of the objective function (to be maximized)
+
+    n_iter_ : int
+        The actual number of iterations to reach the stopping criterion.
+
+        .. versionadded:: 1.3
+
+    intercept_ : float
+        Independent term in decision function. Set to 0.0 if
+        ``fit_intercept = False``.
+
+    X_offset_ : ndarray of shape (n_features,)
+        If `fit_intercept=True`, offset subtracted for centering data to a
+        zero mean. Set to np.zeros(n_features) otherwise.
+
+    X_scale_ : ndarray of shape (n_features,)
+        Set to np.ones(n_features).
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    BayesianRidge : Bayesian ridge regression.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/linear_model/plot_ard.py
+    <sphx_glr_auto_examples_linear_model_plot_ard.py>`.
+
+    References
+    ----------
+    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
+    competition, ASHRAE Transactions, 1994.
+
+    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
+    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
+    Their beta is our ``self.alpha_``
+    Their alpha is our ``self.lambda_``
+    ARD is a little different than the slide: only dimensions/features for
+    which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
+    discarded.
+
+    Examples
+    --------
+    >>> from sklearn import linear_model
+    >>> clf = linear_model.ARDRegression()
+    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
+    ARDRegression()
+    >>> clf.predict([[1, 1]])
+    array([1.])
+    """
+
+    _parameter_constraints: dict = {
+        "max_iter": [Interval(Integral, 1, None, closed="left")],
+        "tol": [Interval(Real, 0, None, closed="left")],
+        "alpha_1": [Interval(Real, 0, None, closed="left")],
+        "alpha_2": [Interval(Real, 0, None, closed="left")],
+        "lambda_1": [Interval(Real, 0, None, closed="left")],
+        "lambda_2": [Interval(Real, 0, None, closed="left")],
+        "compute_score": ["boolean"],
+        "threshold_lambda": [Interval(Real, 0, None, closed="left")],
+        "fit_intercept": ["boolean"],
+        "copy_X": ["boolean"],
+        "verbose": ["verbose"],
+    }
+
+    def __init__(
+        self,
+        *,
+        max_iter=300,
+        tol=1.0e-3,
+        alpha_1=1.0e-6,
+        alpha_2=1.0e-6,
+        lambda_1=1.0e-6,
+        lambda_2=1.0e-6,
+        compute_score=False,
+        threshold_lambda=1.0e4,
+        fit_intercept=True,
+        copy_X=True,
+        verbose=False,
+    ):
+        self.max_iter = max_iter
+        self.tol = tol
+        self.fit_intercept = fit_intercept
+        self.alpha_1 = alpha_1
+        self.alpha_2 = alpha_2
+        self.lambda_1 = lambda_1
+        self.lambda_2 = lambda_2
+        self.compute_score = compute_score
+        self.threshold_lambda = threshold_lambda
+        self.copy_X = copy_X
+        self.verbose = verbose
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y):
+        """Fit the model according to the given training data and parameters.
+
+        Iterative procedure to maximize the evidence: the posterior covariance
+        and mean of the weights are recomputed from the current precisions,
+        the precisions are updated, and weights whose precision reaches
+        `threshold_lambda` are pruned (set to zero and excluded).
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training vector, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+        y : array-like of shape (n_samples,)
+            Target values. Will be cast to X's dtype if necessary.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        X, y = self._validate_data(
+            X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
+        )
+        dtype = X.dtype
+
+        n_samples, n_features = X.shape
+        # Posterior mean of the weights; updated in place by `update_coeff`.
+        coef_ = np.zeros(n_features, dtype=dtype)
+
+        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
+            X, y, fit_intercept=self.fit_intercept, copy=self.copy_X
+        )
+
+        self.X_offset_ = X_offset_
+        self.X_scale_ = X_scale_
+
+        # Launch the convergence loop
+        # Boolean mask of the features that have not been pruned; all
+        # features are active at the start.
+        keep_lambda = np.ones(n_features, dtype=bool)
+
+        lambda_1 = self.lambda_1
+        lambda_2 = self.lambda_2
+        alpha_1 = self.alpha_1
+        alpha_2 = self.alpha_2
+        verbose = self.verbose
+
+        # Initialization of the values of the parameters
+        eps = np.finfo(np.float64).eps
+        # Add `eps` in the denominator to omit division by zero if `np.var(y)`
+        # is zero.
+        # Explicitly set dtype to avoid unintended type promotion with numpy 2.
+        alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype)
+        lambda_ = np.ones(n_features, dtype=dtype)
+
+        self.scores_ = list()
+        coef_old_ = None
+
+        # Writes alpha_ * sigma_ @ X_keep.T @ y into the active entries of
+        # `coef_` (in place) and returns the same array.
+        def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
+            coef_[keep_lambda] = alpha_ * np.linalg.multi_dot(
+                [sigma_, X[:, keep_lambda].T, y]
+            )
+            return coef_
+
+        # `_update_sigma` inverts an (n_features, n_features) matrix while
+        # `_update_sigma_woodbury` inverts an (n_samples, n_samples) one, so
+        # the choice below always inverts the smaller of the two.
+        update_sigma = (
+            self._update_sigma
+            if n_samples >= n_features
+            else self._update_sigma_woodbury
+        )
+        # Iterative procedure of ARDRegression
+        for iter_ in range(self.max_iter):
+            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
+            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
+
+            # Update alpha and lambda
+            # NOTE: despite its name, `rmse_` is the sum of squared residuals.
+            rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
+            gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
+            lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
+                (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
+            )
+            alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
+                rmse_ + 2.0 * alpha_2
+            )
+
+            # Prune the weights with a precision over a threshold
+            keep_lambda = lambda_ < self.threshold_lambda
+            coef_[~keep_lambda] = 0
+
+            # Compute the objective function
+            if self.compute_score:
+                s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
+                s += alpha_1 * log(alpha_) - alpha_2 * alpha_
+                s += 0.5 * (
+                    fast_logdet(sigma_)
+                    + n_samples * log(alpha_)
+                    + np.sum(np.log(lambda_))
+                )
+                s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
+                self.scores_.append(s)
+
+            # Check for convergence
+            # The first iteration is skipped because `coef_old_` is still None.
+            if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
+                if verbose:
+                    print("Converged after %s iterations" % iter_)
+                break
+            coef_old_ = np.copy(coef_)
+
+            # Every weight has been pruned: nothing is left to update.
+            if not keep_lambda.any():
+                break
+
+        self.n_iter_ = iter_ + 1
+
+        if keep_lambda.any():
+            # update sigma and mu using updated params from the last iteration
+            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
+            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
+        else:
+            # No active feature remains, so the covariance is an empty
+            # (0, 0) array.
+            sigma_ = np.array([]).reshape(0, 0)
+
+        self.coef_ = coef_
+        self.alpha_ = alpha_
+        self.sigma_ = sigma_
+        self.lambda_ = lambda_
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
+        return self
+
+    def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
+        # See slides as referenced in the docstring note
+        # this function is used when n_samples < n_features and will invert
+        # a matrix of shape (n_samples, n_samples) making use of the
+        # woodbury formula:
+        # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
+        n_samples = X.shape[0]
+        X_keep = X[:, keep_lambda]
+        inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
+        sigma_ = pinvh(
+            np.eye(n_samples, dtype=X.dtype) / alpha_
+            + np.dot(X_keep * inv_lambda, X_keep.T)
+        )
+        sigma_ = np.dot(sigma_, X_keep * inv_lambda)
+        sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
+        sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
+        return sigma_
+
+    def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
+        # See slides as referenced in the docstring note
+        # this function is used when n_samples >= n_features and will
+        # invert a matrix of shape (n_features, n_features)
+        X_keep = X[:, keep_lambda]
+        gram = np.dot(X_keep.T, X_keep)
+        eye = np.eye(gram.shape[0], dtype=X.dtype)
+        sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
+        sigma_ = pinvh(sigma_inv)
+        return sigma_
+
+    def predict(self, X, return_std=False):
+        """Predict using the linear model.
+
+        In addition to the mean of the predictive distribution, also its
+        standard deviation can be returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Samples.
+
+        return_std : bool, default=False
+            Whether to return the standard deviation of posterior prediction.
+
+        Returns
+        -------
+        y_mean : array-like of shape (n_samples,)
+            Mean of predictive distribution of query points.
+
+        y_std : array-like of shape (n_samples,)
+            Standard deviation of predictive distribution of query points.
+        """
+        y_mean = self._decision_function(X)
+        if return_std is False:
+            return y_mean
+        else:
+            col_index = self.lambda_ < self.threshold_lambda
+            X = _safe_indexing(X, indices=col_index, axis=1)
+            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
+            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
+            return y_mean, y_std
@@ -0,0 +1,961 @@
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
+#         Olivier Grisel <olivier.grisel@ensta.org>
+#         Alexis Mignon <alexis.mignon@gmail.com>
+#         Manoj Kumar <manojkumarsivaraj334@gmail.com>
+#
+# License: BSD 3 clause
+
+from libc.math cimport fabs
+import numpy as np
+
+from cython cimport floating
+import warnings
+from ..exceptions import ConvergenceWarning
+
+from ..utils._cython_blas cimport (
+    _axpy, _dot, _asum, _gemv, _nrm2, _copy, _scal
+)
+from ..utils._cython_blas cimport ColMajor, Trans, NoTrans
+from ..utils._typedefs cimport uint32_t
+from ..utils._random cimport our_rand_r
+
+
+# The following two functions are shamelessly copied from the tree code.
+
+cdef enum:
+    # Max value for our rand_r replacement (`our_rand_r`, cimported above from
+    # ..utils._random). It is the upper bound passed to `rng.randint` when the
+    # solvers in this file draw the seed of their internal generator.
+    # We don't use RAND_MAX because it's different across platforms and
+    # particularly tiny on Windows/MSVC.
+    # It corresponds to the maximum representable value for
+    # 32-bit signed integers (i.e. 2^31 - 1).
+    RAND_R_MAX = 2147483647
+
+
+cdef inline uint32_t rand_int(uint32_t end, uint32_t* random_state) noexcept nogil:
+    """Generate a random integer in [0; end).
+
+    The value is the output of `our_rand_r(random_state)` reduced modulo
+    `end`, so `end` must be non-zero.
+    """
+    return our_rand_r(random_state) % end
+
+
+cdef inline floating fmax(floating x, floating y) noexcept nogil:
+    """Return x when x is strictly greater than y, otherwise y."""
+    return x if x > y else y
+
+
+cdef inline floating fsign(floating f) noexcept nogil:
+    """Return 1.0 for positive f, 0 when f equals zero, and -1.0 otherwise."""
+    if f > 0:
+        return 1.0
+    if f == 0:
+        return 0
+    # Everything that is neither positive nor equal to zero ends up here.
+    return -1.0
+
+
+cdef floating abs_max(int n, const floating* a) noexcept nogil:
+    """Return the largest absolute value among a[0..n-1], like np.max(np.abs(a)).
+
+    a[0] is read unconditionally, so the caller must supply at least one entry.
+    """
+    cdef int pos
+    cdef floating magnitude
+    cdef floating largest = fabs(a[0])
+    for pos in range(1, n):
+        magnitude = fabs(a[pos])
+        if magnitude > largest:
+            largest = magnitude
+    return largest
+
+
+cdef floating max(int n, floating* a) noexcept nogil:
+    """Return the largest entry among a[0..n-1], like np.max(a).
+
+    a[0] is read unconditionally, so the caller must supply at least one entry.
+    """
+    cdef int pos
+    cdef floating largest = a[0]
+    for pos in range(1, n):
+        if a[pos] > largest:
+            largest = a[pos]
+    return largest
+
+
+cdef floating diff_abs_max(int n, const floating* a, floating* b) noexcept nogil:
+    """Return the largest |a[i] - b[i]| for i in 0..n-1, like np.max(np.abs(a - b)).
+
+    Index 0 of both arrays is read unconditionally, so the caller must supply
+    at least one entry in each.
+    """
+    cdef int pos
+    cdef floating deviation
+    cdef floating largest = fabs(a[0] - b[0])
+    for pos in range(1, n):
+        deviation = fabs(a[pos] - b[pos])
+        if deviation > largest:
+            largest = deviation
+    return largest
+
+
+def enet_coordinate_descent(
+    floating[::1] w,
+    floating alpha,
+    floating beta,
+    const floating[::1, :] X,
+    const floating[::1] y,
+    unsigned int max_iter,
+    floating tol,
+    object rng,
+    bint random=0,
+    bint positive=0
+):
+    """Cython version of the coordinate descent algorithm for Elastic-Net regression.
+
+    We minimize
+
+        (1/2) * norm(y - X w, 2)^2 + alpha norm(w, 1) + (beta/2) norm(w, 2)^2
+
+    Parameters
+    ----------
+    w : contiguous 1-D memoryview, one entry per feature
+        Initial coefficients. Updated in place and also returned.
+    alpha : float32 or float64 (fused type `floating`)
+        Weight of the L1 term in the objective above.
+    beta : same type as `alpha`
+        Weight of the squared L2 term in the objective above.
+    X : Fortran-contiguous 2-D memoryview of shape (n_samples, n_features)
+        Design matrix (read-only).
+    y : contiguous 1-D memoryview of shape (n_samples,)
+        Targets (read-only).
+    max_iter : unsigned int
+        Maximum number of passes over the features.
+    tol : same type as `alpha`
+        Tolerance. It is used unscaled for the relative coefficient-update
+        check and, multiplied by `np.dot(y, y)`, as the duality gap threshold.
+    rng : object
+        Must expose `randint(low, high)`; it is called once to draw the seed
+        of the internal generator that picks features when `random` is true.
+    random : bint, default=0
+        If true, every update draws its feature index at random instead of
+        sweeping the features in order.
+    positive : bint, default=0
+        If true, a coefficient is set to zero whenever the dot product of its
+        column with the residual (`tmp`) is negative.
+
+    Returns
+    -------
+    w : ndarray of shape (n_features,)
+        ElasticNet coefficients.
+    gap : float
+        Achieved dual gap.
+    tol : float
+        Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap.
+    n_iter : int
+        Number of coordinate descent iterations.
+    """
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    # get the data information into easy vars
+    cdef unsigned int n_samples = X.shape[0]
+    cdef unsigned int n_features = X.shape[1]
+
+    # compute norms of the columns of X
+    cdef floating[::1] norm_cols_X = np.square(X).sum(axis=0)
+
+    # initial value of the residuals
+    cdef floating[::1] R = np.empty(n_samples, dtype=dtype)
+    cdef floating[::1] XtA = np.empty(n_features, dtype=dtype)
+
+    cdef floating tmp
+    cdef floating w_ii
+    cdef floating d_w_max
+    cdef floating w_max
+    cdef floating d_w_ii
+    cdef floating gap = tol + 1.0
+    # d_w_tol keeps the unscaled tolerance; `tol` itself is rescaled by
+    # dot(y, y) further down.
+    cdef floating d_w_tol = tol
+    cdef floating dual_norm_XtA
+    cdef floating R_norm2
+    cdef floating w_norm2
+    cdef floating l1_norm
+    cdef floating const
+    cdef floating A_norm2
+    cdef unsigned int ii
+    cdef unsigned int n_iter = 0
+    cdef unsigned int f_iter
+    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
+    cdef uint32_t* rand_r_state = &rand_r_state_seed
+
+    if alpha == 0 and beta == 0:
+        warnings.warn("Coordinate descent with no regularization may lead to "
+                      "unexpected results and is discouraged.")
+
+    with nogil:
+        # R = y - np.dot(X, w)
+        _copy(n_samples, &y[0], 1, &R[0], 1)
+        _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0],
+              n_samples, &w[0], 1, 1.0, &R[0], 1)
+
+        # tol *= np.dot(y, y)
+        tol *= _dot(n_samples, &y[0], 1, &y[0], 1)
+
+        for n_iter in range(max_iter):
+            w_max = 0.0
+            d_w_max = 0.0
+            for f_iter in range(n_features):  # Loop over coordinates
+                if random:
+                    ii = rand_int(n_features, rand_r_state)
+                else:
+                    ii = f_iter
+
+                # an all-zero column cannot change the residual: leave w[ii] as is
+                if norm_cols_X[ii] == 0.0:
+                    continue
+
+                w_ii = w[ii]  # Store previous value
+
+                if w_ii != 0.0:
+                    # R += w_ii * X[:,ii]
+                    _axpy(n_samples, w_ii, &X[0, ii], 1, &R[0], 1)
+
+                # tmp = (X[:,ii]*R).sum()
+                tmp = _dot(n_samples, &X[0, ii], 1, &R[0], 1)
+
+                if positive and tmp < 0:
+                    w[ii] = 0.0
+                else:
+                    # soft-thresholding of tmp by alpha, then scaling
+                    w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
+                             / (norm_cols_X[ii] + beta))
+
+                if w[ii] != 0.0:
+                    # R -=  w[ii] * X[:,ii] # Update residual
+                    _axpy(n_samples, -w[ii], &X[0, ii], 1, &R[0], 1)
+
+                # update the maximum absolute coefficient update
+                d_w_ii = fabs(w[ii] - w_ii)
+                d_w_max = fmax(d_w_max, d_w_ii)
+
+                w_max = fmax(w_max, fabs(w[ii]))
+
+            if (
+                w_max == 0.0
+                or d_w_max / w_max < d_w_tol
+                or n_iter == max_iter - 1
+            ):
+                # the biggest coordinate update of this iteration was smaller
+                # than the tolerance (or all coefficients are zero, or this is
+                # the last allowed iteration): check the duality gap as
+                # ultimate stopping criterion
+
+                # XtA = np.dot(X.T, R) - beta * w
+                # XtA first receives a copy of w; gemv then overwrites it with
+                # 1.0 * X.T @ R + (-beta) * XtA.
+                _copy(n_features, &w[0], 1, &XtA[0], 1)
+                _gemv(ColMajor, Trans,
+                      n_samples, n_features, 1.0, &X[0, 0], n_samples,
+                      &R[0], 1,
+                      -beta, &XtA[0], 1)
+
+                if positive:
+                    dual_norm_XtA = max(n_features, &XtA[0])
+                else:
+                    dual_norm_XtA = abs_max(n_features, &XtA[0])
+
+                # R_norm2 = np.dot(R, R)
+                R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1)
+
+                # w_norm2 = np.dot(w, w)
+                w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1)
+
+                if (dual_norm_XtA > alpha):
+                    const = alpha / dual_norm_XtA
+                    A_norm2 = R_norm2 * (const ** 2)
+                    gap = 0.5 * (R_norm2 + A_norm2)
+                else:
+                    const = 1.0
+                    gap = R_norm2
+
+                l1_norm = _asum(n_features, &w[0], 1)
+
+                # np.dot(R.T, y)
+                gap += (alpha * l1_norm
+                        - const * _dot(n_samples, &R[0], 1, &y[0], 1)
+                        + 0.5 * beta * (1 + const ** 2) * (w_norm2))
+
+                if gap < tol:
+                    # return if we reached desired tolerance
+                    break
+
+        else:
+            # for/else, runs if for doesn't end with a `break`
+            with gil:
+                message = (
+                    "Objective did not converge. You might want to increase "
+                    "the number of iterations, check the scale of the "
+                    "features or consider increasing regularisation. "
+                    f"Duality gap: {gap:.3e}, tolerance: {tol:.3e}"
+                )
+                if alpha < np.finfo(np.float64).eps:
+                    message += (
+                        " Linear regression models with null weight for the "
+                        "l1 regularization term are more efficiently fitted "
+                        "using one of the solvers implemented in "
+                        "sklearn.linear_model.Ridge/RidgeCV instead."
+                    )
+                warnings.warn(message, ConvergenceWarning)
+
+    # n_iter is the zero-based index of the last pass, hence the + 1
+    return np.asarray(w), gap, tol, n_iter + 1
+
+
+def sparse_enet_coordinate_descent(
+    floating[::1] w,
+    floating alpha,
+    floating beta,
+    const floating[::1] X_data,
+    const int[::1] X_indices,
+    const int[::1] X_indptr,
+    const floating[::1] y,
+    const floating[::1] sample_weight,
+    const floating[::1] X_mean,
+    unsigned int max_iter,
+    floating tol,
+    object rng,
+    bint random=0,
+    bint positive=0,
+):
+    """Cython version of the coordinate descent algorithm for Elastic-Net
+
+    We minimize:
+
+        1/2 * norm(y - Z w, 2)^2 + alpha * norm(w, 1) + (beta/2) * norm(w, 2)^2
+
+    where Z = X - X_mean.
+    With sample weights sw, this becomes
+
+        1/2 * sum(sw * (y - Z w)^2, axis=0) + alpha * norm(w, 1)
+        + (beta/2) * norm(w, 2)^2
+
+    and X_mean is the weighted average of X (per column).
+
+    Parameters
+    ----------
+    w : contiguous 1-D memoryview, one entry per feature
+        Initial coefficients. Updated in place and also returned.
+    alpha : float32 or float64 (fused type `floating`)
+        Weight of the L1 term in the objective above.
+    beta : same type as `alpha`
+        Weight of the squared L2 term in the objective above.
+    X_data, X_indices, X_indptr : contiguous 1-D memoryviews
+        Column-compressed storage of X: the stored entries of feature `ii` are
+        `X_data[X_indptr[ii]:X_indptr[ii + 1]]` and the matching entries of
+        `X_indices` are their sample indices.
+    y : contiguous 1-D memoryview of shape (n_samples,)
+        Targets (read-only).
+    sample_weight : contiguous 1-D memoryview of shape (n_samples,) or None
+        Per-sample weights; `None` selects the unweighted code path.
+    X_mean : contiguous 1-D memoryview, one entry per feature
+        Column offsets subtracted from X. Centering is only applied when at
+        least one entry is non-zero.
+    max_iter : unsigned int
+        Maximum number of passes over the features.
+    tol : same type as `alpha`
+        Tolerance. It is used unscaled for the relative coefficient-update
+        check and, after scaling (see Returns), as the duality gap threshold.
+    rng : object
+        Must expose `randint(low, high)`; it is called once to draw the seed
+        of the internal generator that picks features when `random` is true.
+    random : bint, default=0
+        If true, every update draws its feature index at random instead of
+        sweeping the features in order.
+    positive : bint, default=0
+        If true, a coefficient is set to zero whenever the dot product of its
+        (centered) column with the residual (`tmp`) is negative.
+
+    Returns
+    -------
+    w : ndarray of shape (n_features,)
+        ElasticNet coefficients.
+    gap : float
+        Achieved dual gap.
+    tol : float
+        Equals input `tol` times `np.dot(y, y)`, or times
+        `np.dot(y, sample_weight * y)` when sample weights are given. The
+        tolerance used for the dual gap.
+    n_iter : int
+        Number of coordinate descent iterations.
+    """
+    # Notes for sample_weight:
+    # For dense X, one centers X and y and then rescales them by sqrt(sample_weight).
+    # Here, for sparse X, we get the sample_weight averaged center X_mean. We take care
+    # that every calculation results as if we had rescaled y and X (and therefore also
+    # X_mean) by sqrt(sample_weight) without actually calculating the square root.
+    # We work with:
+    #     yw = sample_weight * y
+    #     R = sample_weight * residual
+    #     norm_cols_X = np.sum(sample_weight * (X - X_mean)**2, axis=0)
+
+    # get the data information into easy vars
+    cdef unsigned int n_samples = y.shape[0]
+    cdef unsigned int n_features = w.shape[0]
+
+    # compute norms of the columns of X
+    cdef unsigned int ii
+    cdef floating[:] norm_cols_X
+
+    cdef unsigned int startptr = X_indptr[0]
+    cdef unsigned int endptr
+
+    # initial value of the residuals
+    # R = y - Zw, weighted version R = sample_weight * (y - Zw)
+    cdef floating[::1] R
+    cdef floating[::1] XtA
+    cdef const floating[::1] yw
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    norm_cols_X = np.zeros(n_features, dtype=dtype)
+    XtA = np.zeros(n_features, dtype=dtype)
+
+    cdef floating tmp
+    cdef floating w_ii
+    cdef floating d_w_max
+    cdef floating w_max
+    cdef floating d_w_ii
+    cdef floating X_mean_ii
+    cdef floating R_sum = 0.0
+    cdef floating R_norm2
+    cdef floating w_norm2
+    cdef floating A_norm2
+    cdef floating l1_norm
+    cdef floating normalize_sum
+    cdef floating gap = tol + 1.0
+    # d_w_tol keeps the unscaled tolerance; `tol` itself is rescaled below.
+    cdef floating d_w_tol = tol
+    cdef floating dual_norm_XtA
+    cdef unsigned int jj
+    cdef unsigned int n_iter = 0
+    cdef unsigned int f_iter
+    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
+    cdef uint32_t* rand_r_state = &rand_r_state_seed
+    cdef bint center = False
+    cdef bint no_sample_weights = sample_weight is None
+    cdef int kk
+
+    # R starts as a copy of (weighted) y; the -Zw part is subtracted in the
+    # per-feature loop below.
+    if no_sample_weights:
+        yw = y
+        R = y.copy()
+    else:
+        yw = np.multiply(sample_weight, y)
+        R = yw.copy()
+
+    with nogil:
+        # center = (X_mean != 0).any()
+        for ii in range(n_features):
+            if X_mean[ii]:
+                center = True
+                break
+
+        # One pass per feature: accumulate its (weighted, centered) squared
+        # column norm and remove its current contribution from R.
+        for ii in range(n_features):
+            X_mean_ii = X_mean[ii]
+            endptr = X_indptr[ii + 1]
+            normalize_sum = 0.0
+            w_ii = w[ii]
+
+            if no_sample_weights:
+                for jj in range(startptr, endptr):
+                    normalize_sum += (X_data[jj] - X_mean_ii) ** 2
+                    R[X_indices[jj]] -= X_data[jj] * w_ii
+                # entries that are not stored contribute (0 - X_mean_ii) ** 2 each
+                norm_cols_X[ii] = normalize_sum + \
+                    (n_samples - endptr + startptr) * X_mean_ii ** 2
+                if center:
+                    for jj in range(n_samples):
+                        R[jj] += X_mean_ii * w_ii
+            else:
+                for jj in range(startptr, endptr):
+                    tmp = sample_weight[X_indices[jj]]
+                    # second term will be subtracted by loop over range(n_samples)
+                    normalize_sum += (tmp * (X_data[jj] - X_mean_ii) ** 2
+                                      - tmp * X_mean_ii ** 2)
+                    R[X_indices[jj]] -= tmp * X_data[jj] * w_ii
+                if center:
+                    for jj in range(n_samples):
+                        normalize_sum += sample_weight[jj] * X_mean_ii ** 2
+                        R[jj] += sample_weight[jj] * X_mean_ii * w_ii
+                norm_cols_X[ii] = normalize_sum
+            startptr = endptr
+
+        # tol *= np.dot(y, y)
+        # with sample weights: tol *= y @ (sw * y)
+        tol *= _dot(n_samples, &y[0], 1, &yw[0], 1)
+
+        for n_iter in range(max_iter):
+
+            w_max = 0.0
+            d_w_max = 0.0
+
+            for f_iter in range(n_features):  # Loop over coordinates
+                if random:
+                    ii = rand_int(n_features, rand_r_state)
+                else:
+                    ii = f_iter
+
+                # a column with zero norm cannot change the residual: skip it
+                if norm_cols_X[ii] == 0.0:
+                    continue
+
+                startptr = X_indptr[ii]
+                endptr = X_indptr[ii + 1]
+                w_ii = w[ii]  # Store previous value
+                X_mean_ii = X_mean[ii]
+
+                if w_ii != 0.0:
+                    # R += w_ii * X[:,ii]
+                    if no_sample_weights:
+                        for jj in range(startptr, endptr):
+                            R[X_indices[jj]] += X_data[jj] * w_ii
+                        if center:
+                            for jj in range(n_samples):
+                                R[jj] -= X_mean_ii * w_ii
+                    else:
+                        for jj in range(startptr, endptr):
+                            tmp = sample_weight[X_indices[jj]]
+                            R[X_indices[jj]] += tmp * X_data[jj] * w_ii
+                        if center:
+                            for jj in range(n_samples):
+                                R[jj] -= sample_weight[jj] * X_mean_ii * w_ii
+
+                # tmp = (X[:,ii] * R).sum()
+                tmp = 0.0
+                for jj in range(startptr, endptr):
+                    tmp += R[X_indices[jj]] * X_data[jj]
+
+                # centering correction: subtract X_mean_ii * sum(R)
+                if center:
+                    R_sum = 0.0
+                    for jj in range(n_samples):
+                        R_sum += R[jj]
+                    tmp -= R_sum * X_mean_ii
+
+                if positive and tmp < 0.0:
+                    w[ii] = 0.0
+                else:
+                    # soft-thresholding of tmp by alpha, then scaling
+                    w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \
+                            / (norm_cols_X[ii] + beta)
+
+                if w[ii] != 0.0:
+                    # R -=  w[ii] * X[:,ii] # Update residual
+                    if no_sample_weights:
+                        for jj in range(startptr, endptr):
+                            R[X_indices[jj]] -= X_data[jj] * w[ii]
+                        if center:
+                            for jj in range(n_samples):
+                                R[jj] += X_mean_ii * w[ii]
+                    else:
+                        for jj in range(startptr, endptr):
+                            tmp = sample_weight[X_indices[jj]]
+                            R[X_indices[jj]] -= tmp * X_data[jj] * w[ii]
+                        if center:
+                            for jj in range(n_samples):
+                                R[jj] += sample_weight[jj] * X_mean_ii * w[ii]
+
+                # update the maximum absolute coefficient update
+                d_w_ii = fabs(w[ii] - w_ii)
+                d_w_max = fmax(d_w_max, d_w_ii)
+
+                w_max = fmax(w_max, fabs(w[ii]))
+
+            if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1:
+                # the biggest coordinate update of this iteration was smaller than
+                # the tolerance (or all coefficients are zero, or this is the last
+                # allowed iteration): check the duality gap as ultimate stopping
+                # criterion
+
+                # sparse X.T / dense R dot product
+                if center:
+                    R_sum = 0.0
+                    for jj in range(n_samples):
+                        R_sum += R[jj]
+
+                # XtA = X.T @ R - beta * w
+                for ii in range(n_features):
+                    XtA[ii] = 0.0
+                    for kk in range(X_indptr[ii], X_indptr[ii + 1]):
+                        XtA[ii] += X_data[kk] * R[X_indices[kk]]
+
+                    if center:
+                        XtA[ii] -= X_mean[ii] * R_sum
+                    XtA[ii] -= beta * w[ii]
+
+                if positive:
+                    dual_norm_XtA = max(n_features, &XtA[0])
+                else:
+                    dual_norm_XtA = abs_max(n_features, &XtA[0])
+
+                # R_norm2 = np.dot(R, R)
+                if no_sample_weights:
+                    R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1)
+                else:
+                    R_norm2 = 0.0
+                    for jj in range(n_samples):
+                        # R is already multiplied by sample_weight
+                        # (zero-weight samples are skipped to avoid dividing by 0)
+                        if sample_weight[jj] != 0:
+                            R_norm2 += (R[jj] ** 2) / sample_weight[jj]
+
+                # w_norm2 = np.dot(w, w)
+                w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1)
+                if (dual_norm_XtA > alpha):
+                    const = alpha / dual_norm_XtA
+                    A_norm2 = R_norm2 * const**2
+                    gap = 0.5 * (R_norm2 + A_norm2)
+                else:
+                    const = 1.0
+                    gap = R_norm2
+
+                l1_norm = _asum(n_features, &w[0], 1)
+
+                # R already carries the sample weights, so it is dotted with the
+                # unweighted y here.
+                gap += (alpha * l1_norm - const * _dot(
+                            n_samples,
+                            &R[0], 1,
+                            &y[0], 1
+                            )
+                        + 0.5 * beta * (1 + const ** 2) * w_norm2)
+
+                if gap < tol:
+                    # return if we reached desired tolerance
+                    break
+
+        else:
+            # for/else, runs if for doesn't end with a `break`
+            with gil:
+                warnings.warn("Objective did not converge. You might want to "
+                              "increase the number of iterations. Duality "
+                              "gap: {}, tolerance: {}".format(gap, tol),
+                              ConvergenceWarning)
+
+    # n_iter is the zero-based index of the last pass, hence the + 1
+    return np.asarray(w), gap, tol, n_iter + 1
+
+
+def enet_coordinate_descent_gram(
+    floating[::1] w,
+    floating alpha,
+    floating beta,
+    const floating[:, ::1] Q,
+    const floating[::1] q,
+    const floating[:] y,
+    unsigned int max_iter,
+    floating tol,
+    object rng,
+    bint random=0,
+    bint positive=0
+):
+    """Cython version of the coordinate descent algorithm for Elastic-Net regression.
+
+    We minimize
+
+        (1/2) * w^T Q w - q^T w + alpha norm(w, 1) + (beta/2) * norm(w, 2)^2
+
+    which amounts to the Elastic-Net problem when:
+
+        Q = X^T X (Gram matrix)
+        q = X^T y
+
+    Parameters
+    ----------
+    w : contiguous 1-D memoryview, one entry per feature
+        Initial coefficients. Updated in place and also returned.
+    alpha : float32 or float64 (fused type `floating`)
+        Weight of the L1 term in the objective above.
+    beta : same type as `alpha`
+        Weight of the squared L2 term in the objective above.
+    Q : C-contiguous 2-D memoryview of shape (n_features, n_features)
+        Matrix of the quadratic term (read-only).
+    q : contiguous 1-D memoryview of shape (n_features,)
+        Vector of the linear term (read-only).
+    y : 1-D memoryview
+        Targets. Only `np.dot(y, y)` is used, to scale `tol` and in the
+        duality gap.
+    max_iter : unsigned int
+        Maximum number of passes over the features.
+    tol : same type as `alpha`
+        Tolerance. It is used unscaled for the relative coefficient-update
+        check and, multiplied by `np.dot(y, y)`, as the duality gap threshold.
+    rng : object
+        Must expose `randint(low, high)`; it is called once to draw the seed
+        of the internal generator that picks features when `random` is true.
+    random : bint, default=0
+        If true, every update draws its feature index at random instead of
+        sweeping the features in order.
+    positive : bint, default=0
+        If true, a coefficient is set to zero whenever `q[ii] - H[ii]`
+        (`tmp`) is negative.
+
+    Returns
+    -------
+    w : ndarray of shape (n_features,)
+        ElasticNet coefficients.
+    gap : float
+        Achieved dual gap.
+    tol : float
+        Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap.
+    n_iter : int
+        Number of coordinate descent iterations.
+    """
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    # get the data information into easy vars
+    cdef unsigned int n_features = Q.shape[0]
+
+    # initial value "Q w" which will be kept up to date in the iterations
+    cdef floating[:] H = np.dot(Q, w)
+
+    cdef floating[:] XtA = np.zeros(n_features, dtype=dtype)
+    cdef floating tmp
+    cdef floating w_ii
+    cdef floating d_w_max
+    cdef floating w_max
+    cdef floating d_w_ii
+    cdef floating q_dot_w
+    cdef floating w_norm2
+    cdef floating gap = tol + 1.0
+    # d_w_tol keeps the unscaled tolerance; `tol` itself is rescaled below.
+    cdef floating d_w_tol = tol
+    cdef floating dual_norm_XtA
+    cdef unsigned int ii
+    cdef unsigned int n_iter = 0
+    cdef unsigned int f_iter
+    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
+    cdef uint32_t* rand_r_state = &rand_r_state_seed
+
+    cdef floating y_norm2 = np.dot(y, y)
+    # raw pointers for the BLAS calls inside the nogil section
+    cdef floating* w_ptr = &w[0]
+    cdef const floating* Q_ptr = &Q[0, 0]
+    cdef const floating* q_ptr = &q[0]
+    cdef floating* H_ptr = &H[0]
+    cdef floating* XtA_ptr = &XtA[0]
+    tol = tol * y_norm2
+
+    if alpha == 0:
+        warnings.warn(
+            "Coordinate descent without L1 regularization may "
+            "lead to unexpected results and is discouraged. "
+            "Set l1_ratio > 0 to add L1 regularization."
+        )
+
+    with nogil:
+        for n_iter in range(max_iter):
+            w_max = 0.0
+            d_w_max = 0.0
+            for f_iter in range(n_features):  # Loop over coordinates
+                if random:
+                    ii = rand_int(n_features, rand_r_state)
+                else:
+                    ii = f_iter
+
+                # zero diagonal entry: leave w[ii] untouched
+                if Q[ii, ii] == 0.0:
+                    continue
+
+                w_ii = w[ii]  # Store previous value
+
+                if w_ii != 0.0:
+                    # H -= w_ii * Q[ii]
+                    # (row ii of the C-contiguous Q starts at Q_ptr + ii * n_features)
+                    _axpy(n_features, -w_ii, Q_ptr + ii * n_features, 1,
+                          H_ptr, 1)
+
+                tmp = q[ii] - H[ii]
+
+                if positive and tmp < 0:
+                    w[ii] = 0.0
+                else:
+                    # soft-thresholding of tmp by alpha, then scaling
+                    w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \
+                        / (Q[ii, ii] + beta)
+
+                if w[ii] != 0.0:
+                    # H +=  w[ii] * Q[ii] # Update H = X.T X w
+                    _axpy(n_features, w[ii], Q_ptr + ii * n_features, 1,
+                          H_ptr, 1)
+
+                # update the maximum absolute coefficient update
+                d_w_ii = fabs(w[ii] - w_ii)
+                if d_w_ii > d_w_max:
+                    d_w_max = d_w_ii
+
+                if fabs(w[ii]) > w_max:
+                    w_max = fabs(w[ii])
+
+            if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1:
+                # the biggest coordinate update of this iteration was smaller than
+                # the tolerance (or all coefficients are zero, or this is the last
+                # allowed iteration): check the duality gap as ultimate stopping
+                # criterion
+
+                # q_dot_w = np.dot(w, q)
+                q_dot_w = _dot(n_features, w_ptr, 1, q_ptr, 1)
+
+                # XtA = q - H - beta * w
+                for ii in range(n_features):
+                    XtA[ii] = q[ii] - H[ii] - beta * w[ii]
+                if positive:
+                    dual_norm_XtA = max(n_features, XtA_ptr)
+                else:
+                    dual_norm_XtA = abs_max(n_features, XtA_ptr)
+
+                # temp = np.sum(w * H)
+                tmp = 0.0
+                for ii in range(n_features):
+                    tmp += w[ii] * H[ii]
+                # R_norm2 = y.y + w.H - 2 * q.w, computed without forming a residual
+                R_norm2 = y_norm2 + tmp - 2.0 * q_dot_w
+
+                # w_norm2 = np.dot(w, w)
+                w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1)
+
+                if (dual_norm_XtA > alpha):
+                    const = alpha / dual_norm_XtA
+                    A_norm2 = R_norm2 * (const ** 2)
+                    gap = 0.5 * (R_norm2 + A_norm2)
+                else:
+                    const = 1.0
+                    gap = R_norm2
+
+                # The call to asum is equivalent to the L1 norm of w
+                gap += (
+                    alpha * _asum(n_features, &w[0], 1)
+                    - const * y_norm2
+                    + const * q_dot_w
+                    + 0.5 * beta * (1 + const ** 2) * w_norm2
+                )
+
+                if gap < tol:
+                    # return if we reached desired tolerance
+                    break
+
+        else:
+            # for/else, runs if for doesn't end with a `break`
+            with gil:
+                warnings.warn("Objective did not converge. You might want to "
+                              "increase the number of iterations. Duality "
+                              "gap: {}, tolerance: {}".format(gap, tol),
+                              ConvergenceWarning)
+
+    # n_iter is the zero-based index of the last pass, hence the + 1
+    return np.asarray(w), gap, tol, n_iter + 1
+
+
+def enet_coordinate_descent_multi_task(
+    const floating[::1, :] W,
+    floating l1_reg,
+    floating l2_reg,
+    const floating[::1, :] X,
+    const floating[::1, :] Y,
+    unsigned int max_iter,
+    floating tol,
+    object rng,
+    bint random=0
+):
+    """Cython version of the coordinate descent algorithm for Elastic-Net
+    multi-task regression.
+
+    We minimize
+
+        0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2
+
+    Parameters
+    ----------
+    W : Fortran-contiguous 2-D memoryview of shape (n_tasks, n_features)
+        Initial coefficients. Although declared const, the columns of W are
+        overwritten in place through raw pointers, and W is also returned.
+    l1_reg : float32 or float64 (fused type `floating`)
+        Weight of the L21 term in the objective above.
+    l2_reg : same type as `l1_reg`
+        Weight of the squared L2 term in the objective above.
+    X : Fortran-contiguous 2-D memoryview of shape (n_samples, n_features)
+        Design matrix (read-only).
+    Y : Fortran-contiguous 2-D memoryview of shape (n_samples, n_tasks)
+        Targets (read-only).
+    max_iter : unsigned int
+        Maximum number of passes over the features.
+    tol : same type as `l1_reg`
+        Tolerance. It is used unscaled for the relative coefficient-update
+        check and, after scaling (see Returns), as the duality gap threshold.
+    rng : object
+        Must expose `randint(low, high)`; it is called once to draw the seed
+        of the internal generator that picks features when `random` is true.
+    random : bint, default=0
+        If true, every update draws its feature index at random instead of
+        sweeping the features in order.
+
+    Returns
+    -------
+    W : ndarray of shape (n_tasks, n_features)
+        ElasticNet coefficients.
+    gap : float
+        Achieved dual gap.
+    tol : float
+        Equals input `tol` times the squared Frobenius norm of `Y`. The
+        tolerance used for the dual gap.
+    n_iter : int
+        Number of coordinate descent iterations.
+    """
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    # get the data information into easy vars
+    cdef unsigned int n_samples = X.shape[0]
+    cdef unsigned int n_features = X.shape[1]
+    cdef unsigned int n_tasks = Y.shape[1]
+
+    # to store XtA
+    cdef floating[:, ::1] XtA = np.zeros((n_features, n_tasks), dtype=dtype)
+    cdef floating XtA_axis1norm
+    cdef floating dual_norm_XtA
+
+    # initial value of the residuals
+    cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F')
+
+    cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype)
+    cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype)
+    cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype)
+    cdef floating d_w_max
+    cdef floating w_max
+    cdef floating d_w_ii
+    cdef floating nn
+    cdef floating W_ii_abs_max
+    cdef floating gap = tol + 1.0
+    # d_w_tol keeps the unscaled tolerance; `tol` itself is rescaled below.
+    cdef floating d_w_tol = tol
+    cdef floating R_norm
+    cdef floating w_norm
+    cdef floating ry_sum
+    cdef floating l21_norm
+    cdef unsigned int ii
+    cdef unsigned int jj
+    cdef unsigned int n_iter = 0
+    cdef unsigned int f_iter
+    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
+    cdef uint32_t* rand_r_state = &rand_r_state_seed
+
+    # X and Y are Fortran-ordered, so column ii of X starts at
+    # X_ptr + ii * n_samples.
+    cdef const floating* X_ptr = &X[0, 0]
+    cdef const floating* Y_ptr = &Y[0, 0]
+
+    if l1_reg == 0:
+        warnings.warn(
+            "Coordinate descent with l1_reg=0 may lead to unexpected"
+            " results and is discouraged."
+        )
+
+    with nogil:
+        # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0)
+        for ii in range(n_features):
+            norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2
+
+        # R = Y - np.dot(X, W.T)
+        _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1)
+        for ii in range(n_features):
+            for jj in range(n_tasks):
+                if W[jj, ii] != 0:
+                    _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
+                          &R[0, jj], 1)
+
+        # tol = tol * linalg.norm(Y, ord='fro') ** 2
+        tol = tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2
+
+        for n_iter in range(max_iter):
+            w_max = 0.0
+            d_w_max = 0.0
+            for f_iter in range(n_features):  # Loop over coordinates
+                if random:
+                    ii = rand_int(n_features, rand_r_state)
+                else:
+                    ii = f_iter
+
+                # an all-zero column cannot change the residual: skip it
+                if norm_cols_X[ii] == 0.0:
+                    continue
+
+                # w_ii = W[:, ii] # Store previous value
+                _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1)
+
+                # Using Numpy:
+                # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update
+                # Using Blas Level2:
+                # _ger(RowMajor, n_samples, n_tasks, 1.0,
+                #      &X[0, ii], 1,
+                #      &w_ii[0], 1, &R[0, 0], n_tasks)
+                # Using Blas Level1 and for loop to avoid slower threads
+                # for such small vectors
+                for jj in range(n_tasks):
+                    if w_ii[jj] != 0:
+                        _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1,
+                              &R[0, jj], 1)
+
+                # Using numpy:
+                # tmp = np.dot(X[:, ii][None, :], R).ravel()
+                # Using BLAS Level 2:
+                # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0],
+                #       n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1)
+                # Using BLAS Level 1 (faster for small vectors like here):
+                for jj in range(n_tasks):
+                    tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1,
+                                   &R[0, jj], 1)
+
+                # nn = sqrt(np.sum(tmp ** 2))
+                nn = _nrm2(n_tasks, &tmp[0], 1)
+
+                # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg)
+                # (column ii of W is overwritten in place with the scaled tmp)
+                _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1)
+                _scal(n_tasks, fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg),
+                      &W[0, ii], 1)
+
+                # Using numpy:
+                # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :])
+                # Using BLAS Level 2:
+                # Update residual : rank 1 update
+                # _ger(RowMajor, n_samples, n_tasks, -1.0,
+                #      &X[0, ii], 1, &W[0, ii], 1,
+                #      &R[0, 0], n_tasks)
+                # Using BLAS Level 1 (faster for small vectors like here):
+                for jj in range(n_tasks):
+                    if W[jj, ii] != 0:
+                        _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
+                              &R[0, jj], 1)
+
+                # update the maximum absolute coefficient update
+                d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0])
+
+                if d_w_ii > d_w_max:
+                    d_w_max = d_w_ii
+
+                W_ii_abs_max = abs_max(n_tasks, &W[0, ii])
+                if W_ii_abs_max > w_max:
+                    w_max = W_ii_abs_max
+
+            if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1:
+                # the biggest coordinate update of this iteration was smaller than
+                # the tolerance (or all coefficients are zero, or this is the last
+                # allowed iteration): check the duality gap as ultimate stopping
+                # criterion
+
+                # XtA = np.dot(X.T, R) - l2_reg * W.T
+                for ii in range(n_features):
+                    for jj in range(n_tasks):
+                        XtA[ii, jj] = _dot(
+                            n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1
+                            ) - l2_reg * W[jj, ii]
+
+                # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1)))
+                dual_norm_XtA = 0.0
+                for ii in range(n_features):
+                    # np.sqrt(np.sum(XtA ** 2, axis=1))
+                    XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1)
+                    if XtA_axis1norm > dual_norm_XtA:
+                        dual_norm_XtA = XtA_axis1norm
+
+                # TODO: use squared L2 norm directly
+                # R_norm = linalg.norm(R, ord='fro')
+                # w_norm = linalg.norm(W, ord='fro')
+                R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1)
+                w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1)
+                if (dual_norm_XtA > l1_reg):
+                    const = l1_reg / dual_norm_XtA
+                    A_norm = R_norm * const
+                    gap = 0.5 * (R_norm ** 2 + A_norm ** 2)
+                else:
+                    const = 1.0
+                    gap = R_norm ** 2
+
+                # ry_sum = np.sum(R * y)
+                ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1)
+
+                # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum()
+                l21_norm = 0.0
+                for ii in range(n_features):
+                    l21_norm += _nrm2(n_tasks, &W[0, ii], 1)
+
+                gap += (
+                    l1_reg * l21_norm
+                    - const * ry_sum
+                    + 0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2)
+                )
+
+                if gap < tol:
+                    # return if we reached desired tolerance
+                    break
+        else:
+            # for/else, runs if for doesn't end with a `break`
+            with gil:
+                warnings.warn("Objective did not converge. You might want to "
+                              "increase the number of iterations. Duality "
+                              "gap: {}, tolerance: {}".format(gap, tol),
+                              ConvergenceWarning)
+
+    # n_iter is the zero-based index of the last pass, hence the + 1
+    return np.asarray(W), gap, tol, n_iter + 1
@@ -0,0 +1,15 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+from .glm import (
+    GammaRegressor,
+    PoissonRegressor,
+    TweedieRegressor,
+    _GeneralizedLinearRegressor,
+)
+
+# Export list of the `_glm` package: every name below is imported from `.glm`
+# above. The underscore-prefixed `_GeneralizedLinearRegressor` base class is listed
+# explicitly, so it is exported by a star import together with the regressors.
+__all__ = [
+    "_GeneralizedLinearRegressor",
+    "PoissonRegressor",
+    "GammaRegressor",
+    "TweedieRegressor",
+]
@@ -0,0 +1,523 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+"""
+Newton solver for Generalized Linear Models
+"""
+
+import warnings
+from abc import ABC, abstractmethod
+
+import numpy as np
+import scipy.linalg
+import scipy.optimize
+
+from ..._loss.loss import HalfSquaredError
+from ...exceptions import ConvergenceWarning
+from ...utils.optimize import _check_optimize_result
+from .._linear_loss import LinearModelLoss
+
+
+class NewtonSolver(ABC):
+    """Newton solver for GLMs.
+
+    This class implements Newton/2nd-order optimization routines for GLMs. Each Newton
+    iteration aims at finding the Newton step which is done by the inner solver. With
+    Hessian H, gradient g and coefficients coef, one step solves:
+
+        H @ coef_newton = -g
+
+    For our GLM / LinearModelLoss, we have gradient g and Hessian H:
+
+        g = X.T @ loss.gradient + l2_reg_strength * coef
+        H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity
+
+    Backtracking line search updates coef = coef_old + t * coef_newton for some t in
+    (0, 1].
+
+    This is a base class, actual implementations (child classes) may deviate from the
+    above pattern and use structure specific tricks.
+
+    Usage pattern:
+        - initialize solver: sol = NewtonSolver(...)
+        - solve the problem: sol.solve(X, y, sample_weight)
+
+    References
+    ----------
+    - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization"
+      2nd edition
+      https://doi.org/10.1007/978-0-387-40065-5
+
+    - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization."
+      Cambridge University Press, 2004.
+      https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf
+
+    Parameters
+    ----------
+    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+        Initial coefficients of a linear model.
+        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+        i.e. one reconstructs the 2d-array via
+        coef.reshape((n_classes, -1), order="F").
+
+    linear_loss : LinearModelLoss
+        The loss to be minimized.
+
+    l2_reg_strength : float, default=0.0
+        L2 regularization strength.
+
+    tol : float, default=1e-4
+        The optimization problem is solved when each of the following condition is
+        fulfilled:
+        1. maximum |gradient| <= tol
+        2. Newton decrement d: 1/2 * d^2 <= tol
+
+    max_iter : int, default=100
+        Maximum number of Newton steps allowed.
+
+    n_threads : int, default=1
+        Number of OpenMP threads to use for the computation of the Hessian and gradient
+        of the loss function.
+
+    Attributes
+    ----------
+    coef_old : ndarray of shape coef.shape
+        Coefficient of previous iteration.
+
+    coef_newton : ndarray of shape coef.shape
+        Newton step.
+
+    gradient : ndarray of shape coef.shape
+        Gradient of the loss w.r.t. the coefficients.
+
+    gradient_old : ndarray of shape coef.shape
+        Gradient of previous iteration.
+
+    loss_value : float
+        Value of objective function = loss + penalty.
+
+    loss_value_old : float
+        Value of objective function of previous itertion.
+
+    raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes)
+
+    converged : bool
+        Indicator for convergence of the solver.
+
+    iteration : int
+        Number of Newton steps, i.e. calls to inner_solve
+
+    use_fallback_lbfgs_solve : bool
+        If set to True, the solver will resort to call LBFGS to finish the optimisation
+        procedure in case of convergence issues.
+
+    gradient_times_newton : float
+        gradient @ coef_newton, set in inner_solve and used by line_search. If the
+        Newton step is a descent direction, this is negative.
+    """
+
+    def __init__(
+        self,
+        *,
+        coef,
+        linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True),
+        l2_reg_strength=0.0,
+        tol=1e-4,
+        max_iter=100,
+        n_threads=1,
+        verbose=0,
+    ):
+        self.coef = coef
+        self.linear_loss = linear_loss
+        self.l2_reg_strength = l2_reg_strength
+        self.tol = tol
+        self.max_iter = max_iter
+        self.n_threads = n_threads
+        self.verbose = verbose
+
+    def setup(self, X, y, sample_weight):
+        """Precomputations
+
+        If None, initializes:
+            - self.coef
+        Sets:
+            - self.raw_prediction
+            - self.loss_value
+        """
+        _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X)
+        self.loss_value = self.linear_loss.loss(
+            coef=self.coef,
+            X=X,
+            y=y,
+            sample_weight=sample_weight,
+            l2_reg_strength=self.l2_reg_strength,
+            n_threads=self.n_threads,
+            raw_prediction=self.raw_prediction,
+        )
+
+    @abstractmethod
+    def update_gradient_hessian(self, X, y, sample_weight):
+        """Update gradient and Hessian."""
+
+    @abstractmethod
+    def inner_solve(self, X, y, sample_weight):
+        """Compute Newton step.
+
+        Sets:
+            - self.coef_newton
+            - self.gradient_times_newton
+        """
+
+    def fallback_lbfgs_solve(self, X, y, sample_weight):
+        """Fallback solver in case of emergency.
+
+        If a solver detects convergence problems, it may fall back to this methods in
+        the hope to exit with success instead of raising an error.
+
+        Sets:
+            - self.coef
+            - self.converged
+        """
+        opt_res = scipy.optimize.minimize(
+            self.linear_loss.loss_gradient,
+            self.coef,
+            method="L-BFGS-B",
+            jac=True,
+            options={
+                "maxiter": self.max_iter,
+                "maxls": 50,  # default is 20
+                "iprint": self.verbose - 1,
+                "gtol": self.tol,
+                "ftol": 64 * np.finfo(np.float64).eps,
+            },
+            args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads),
+        )
+        self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
+        self.coef = opt_res.x
+        self.converged = opt_res.status == 0
+
+    def line_search(self, X, y, sample_weight):
+        """Backtracking line search.
+
+        Sets:
+            - self.coef_old
+            - self.coef
+            - self.loss_value_old
+            - self.loss_value
+            - self.gradient_old
+            - self.gradient
+            - self.raw_prediction
+        """
+        # line search parameters
+        beta, sigma = 0.5, 0.00048828125  # 1/2, 1/2**11
+        eps = 16 * np.finfo(self.loss_value.dtype).eps
+        t = 1  # step size
+
+        # gradient_times_newton = self.gradient @ self.coef_newton
+        # was computed in inner_solve.
+        armijo_term = sigma * self.gradient_times_newton
+        _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw(
+            self.coef_newton, X
+        )
+
+        self.coef_old = self.coef
+        self.loss_value_old = self.loss_value
+        self.gradient_old = self.gradient
+
+        # np.sum(np.abs(self.gradient_old))
+        sum_abs_grad_old = -1
+
+        is_verbose = self.verbose >= 2
+        if is_verbose:
+            print("  Backtracking Line Search")
+            print(f"    eps=16 * finfo.eps={eps}")
+
+        for i in range(21):  # until and including t = beta**20 ~ 1e-6
+            self.coef = self.coef_old + t * self.coef_newton
+            raw = self.raw_prediction + t * raw_prediction_newton
+            self.loss_value, self.gradient = self.linear_loss.loss_gradient(
+                coef=self.coef,
+                X=X,
+                y=y,
+                sample_weight=sample_weight,
+                l2_reg_strength=self.l2_reg_strength,
+                n_threads=self.n_threads,
+                raw_prediction=raw,
+            )
+            # Note: If coef_newton is too large, loss_gradient may produce inf values,
+            # potentially accompanied by a RuntimeWarning.
+            # This case will be captured by the Armijo condition.
+
+            # 1. Check Armijo / sufficient decrease condition.
+            # The smaller (more negative) the better.
+            loss_improvement = self.loss_value - self.loss_value_old
+            check = loss_improvement <= t * armijo_term
+            if is_verbose:
+                print(
+                    f"    line search iteration={i+1}, step size={t}\n"
+                    f"      check loss improvement <= armijo term: {loss_improvement} "
+                    f"<= {t * armijo_term} {check}"
+                )
+            if check:
+                break
+            # 2. Deal with relative loss differences around machine precision.
+            tiny_loss = np.abs(self.loss_value_old * eps)
+            check = np.abs(loss_improvement) <= tiny_loss
+            if is_verbose:
+                print(
+                    "      check loss |improvement| <= eps * |loss_old|:"
+                    f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
+                )
+            if check:
+                if sum_abs_grad_old < 0:
+                    sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1)
+                # 2.1 Check sum of absolute gradients as alternative condition.
+                sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1)
+                check = sum_abs_grad < sum_abs_grad_old
+                if is_verbose:
+                    print(
+                        "      check sum(|gradient|) < sum(|gradient_old|): "
+                        f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
+                    )
+                if check:
+                    break
+
+            t *= beta
+        else:
+            warnings.warn(
+                (
+                    f"Line search of Newton solver {self.__class__.__name__} at"
+                    f" iteration #{self.iteration} did no converge after 21 line search"
+                    " refinement iterations. It will now resort to lbfgs instead."
+                ),
+                ConvergenceWarning,
+            )
+            if self.verbose:
+                print("  Line search did not converge and resorts to lbfgs instead.")
+            self.use_fallback_lbfgs_solve = True
+            return
+
+        self.raw_prediction = raw
+
+    def check_convergence(self, X, y, sample_weight):
+        """Check for convergence.
+
+        Sets self.converged.
+        """
+        if self.verbose:
+            print("  Check Convergence")
+        # Note: Checking maximum relative change of coefficient <= tol is a bad
+        # convergence criterion because even a large step could have brought us close
+        # to the true minimum.
+        # coef_step = self.coef - self.coef_old
+        # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old)))
+
+        # 1. Criterion: maximum |gradient| <= tol
+        #    The gradient was already updated in line_search()
+        check = np.max(np.abs(self.gradient))
+        if self.verbose:
+            print(f"    1. max |gradient| {check} <= {self.tol}")
+        if check > self.tol:
+            return
+
+        # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol
+        #       d = sqrt(grad @ hessian^-1 @ grad)
+        #         = sqrt(coef_newton @ hessian @ coef_newton)
+        #    See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1.
+        d2 = self.coef_newton @ self.hessian @ self.coef_newton
+        if self.verbose:
+            print(f"    2. Newton decrement {0.5 * d2} <= {self.tol}")
+        if 0.5 * d2 > self.tol:
+            return
+
+        if self.verbose:
+            loss_value = self.linear_loss.loss(
+                coef=self.coef,
+                X=X,
+                y=y,
+                sample_weight=sample_weight,
+                l2_reg_strength=self.l2_reg_strength,
+                n_threads=self.n_threads,
+            )
+            print(f"  Solver did converge at loss = {loss_value}.")
+        self.converged = True
+
+    def finalize(self, X, y, sample_weight):
+        """Finalize the solvers results.
+
+        Some solvers may need this, others not.
+        """
+        pass
+
+    def solve(self, X, y, sample_weight):
+        """Solve the optimization problem.
+
+        This is the main routine.
+
+        Order of calls:
+            self.setup()
+            while iteration:
+                self.update_gradient_hessian()
+                self.inner_solve()
+                self.line_search()
+                self.check_convergence()
+            self.finalize()
+
+        Returns
+        -------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Solution of the optimization problem.
+        """
+        # setup usually:
+        #   - initializes self.coef if needed
+        #   - initializes and calculates self.raw_predictions, self.loss_value
+        self.setup(X=X, y=y, sample_weight=sample_weight)
+
+        self.iteration = 1
+        self.converged = False
+        self.use_fallback_lbfgs_solve = False
+
+        while self.iteration <= self.max_iter and not self.converged:
+            if self.verbose:
+                print(f"Newton iter={self.iteration}")
+
+            self.use_fallback_lbfgs_solve = False  # Fallback solver.
+
+            # 1. Update Hessian and gradient
+            self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight)
+
+            # TODO:
+            # if iteration == 1:
+            # We might stop early, e.g. we already are close to the optimum,
+            # usually detected by zero gradients at this stage.
+
+            # 2. Inner solver
+            #    Calculate Newton step/direction
+            #    This usually sets self.coef_newton and self.gradient_times_newton.
+            self.inner_solve(X=X, y=y, sample_weight=sample_weight)
+            if self.use_fallback_lbfgs_solve:
+                break
+
+            # 3. Backtracking line search
+            #    This usually sets self.coef_old, self.coef, self.loss_value_old
+            #    self.loss_value, self.gradient_old, self.gradient,
+            #    self.raw_prediction.
+            self.line_search(X=X, y=y, sample_weight=sample_weight)
+            if self.use_fallback_lbfgs_solve:
+                break
+
+            # 4. Check convergence
+            #    Sets self.converged.
+            self.check_convergence(X=X, y=y, sample_weight=sample_weight)
+
+            # 5. Next iteration
+            self.iteration += 1
+
+        if not self.converged:
+            if self.use_fallback_lbfgs_solve:
+                # Note: The fallback solver circumvents check_convergence and relies on
+                # the convergence checks of lbfgs instead. Enough warnings have been
+                # raised on the way.
+                self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight)
+            else:
+                warnings.warn(
+                    (
+                        f"Newton solver did not converge after {self.iteration - 1} "
+                        "iterations."
+                    ),
+                    ConvergenceWarning,
+                )
+
+        self.iteration -= 1
+        self.finalize(X=X, y=y, sample_weight=sample_weight)
+        return self.coef
+
+
+class NewtonCholeskySolver(NewtonSolver):
+    """Cholesky based Newton solver.
+
+    Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear
+    solver.
+    """
+
+    def setup(self, X, y, sample_weight):
+        super().setup(X=X, y=y, sample_weight=sample_weight)
+        n_dof = X.shape[1]
+        if self.linear_loss.fit_intercept:
+            n_dof += 1
+        self.gradient = np.empty_like(self.coef)
+        self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof))
+
+    def update_gradient_hessian(self, X, y, sample_weight):
+        _, _, self.hessian_warning = self.linear_loss.gradient_hessian(
+            coef=self.coef,
+            X=X,
+            y=y,
+            sample_weight=sample_weight,
+            l2_reg_strength=self.l2_reg_strength,
+            n_threads=self.n_threads,
+            gradient_out=self.gradient,
+            hessian_out=self.hessian,
+            raw_prediction=self.raw_prediction,  # this was updated in line_search
+        )
+
+    def inner_solve(self, X, y, sample_weight):
+        if self.hessian_warning:
+            warnings.warn(
+                (
+                    f"The inner solver of {self.__class__.__name__} detected a "
+                    "pointwise hessian with many negative values at iteration "
+                    f"#{self.iteration}. It will now resort to lbfgs instead."
+                ),
+                ConvergenceWarning,
+            )
+            if self.verbose:
+                print(
+                    "  The inner solver detected a pointwise Hessian with many "
+                    "negative values and resorts to lbfgs instead."
+                )
+            self.use_fallback_lbfgs_solve = True
+            return
+
+        try:
+            with warnings.catch_warnings():
+                warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
+                self.coef_newton = scipy.linalg.solve(
+                    self.hessian, -self.gradient, check_finite=False, assume_a="sym"
+                )
+                self.gradient_times_newton = self.gradient @ self.coef_newton
+                if self.gradient_times_newton > 0:
+                    if self.verbose:
+                        print(
+                            "  The inner solver found a Newton step that is not a "
+                            "descent direction and resorts to LBFGS steps instead."
+                        )
+                    self.use_fallback_lbfgs_solve = True
+                    return
+        except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e:
+            warnings.warn(
+                f"The inner solver of {self.__class__.__name__} stumbled upon a "
+                "singular or very ill-conditioned Hessian matrix at iteration "
+                f"#{self.iteration}. It will now resort to lbfgs instead.\n"
+                "Further options are to use another solver or to avoid such situation "
+                "in the first place. Possible remedies are removing collinear features"
+                " of X or increasing the penalization strengths.\n"
+                "The original Linear Algebra message was:\n" + str(e),
+                scipy.linalg.LinAlgWarning,
+            )
+            # Possible causes:
+            # 1. hess_pointwise is negative. But this is already taken care in
+            #    LinearModelLoss.gradient_hessian.
+            # 2. X is singular or ill-conditioned
+            #    This might be the most probable cause.
+            #
+            # There are many possible ways to deal with this situation. Most of them
+            # add, explicitly or implicitly, a matrix to the hessian to make it
+            # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed.
+            # Instead, we resort to lbfgs.
+            if self.verbose:
+                print(
+                    "  The inner solver stumbled upon an singular or ill-conditioned "
+                    "Hessian matrix and resorts to LBFGS instead."
+                )
+            self.use_fallback_lbfgs_solve = True
+            return
@@ -0,0 +1,902 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+"""
+Generalized Linear Models with Exponential Dispersion Family
+"""
+
+from numbers import Integral, Real
+
+import numpy as np
+import scipy.optimize
+
+from ..._loss.loss import (
+    HalfGammaLoss,
+    HalfPoissonLoss,
+    HalfSquaredError,
+    HalfTweedieLoss,
+    HalfTweedieLossIdentity,
+)
+from ...base import BaseEstimator, RegressorMixin, _fit_context
+from ...utils import check_array
+from ...utils._openmp_helpers import _openmp_effective_n_threads
+from ...utils._param_validation import Hidden, Interval, StrOptions
+from ...utils.optimize import _check_optimize_result
+from ...utils.validation import _check_sample_weight, check_is_fitted
+from .._linear_loss import LinearModelLoss
+from ._newton_solver import NewtonCholeskySolver, NewtonSolver
+
+
+class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator):
+    """Regression via a penalized Generalized Linear Model (GLM).
+
+    GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and
+    predicting the mean of the target y as y_pred=h(X*w) with coefficients w.
+    Therefore, the fit minimizes the following objective function with L2 priors as
+    regularizer::
+
+        1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w))) + 1/2 * alpha * ||w||_2^2
+
+    with inverse link function h, s=sample_weight and per observation (unit) deviance
+    deviance(y_i, h(x_i*w)). Note that for an EDM, 1/2 * deviance is the negative
+    log-likelihood up to a constant (in w) term.
+    The parameter ``alpha`` corresponds to the lambda parameter in glmnet.
+
+    Instead of implementing the EDM family and a link function separately, we directly
+    use the loss functions `from sklearn._loss` which have the link functions included
+    in them for performance reasons. We pick the loss functions that implement
+    (1/2 times) EDM deviances.
+
+    Read more in the :ref:`User Guide <Generalized_linear_models>`.
+
+    .. versionadded:: 0.23
+
+    Parameters
+    ----------
+    alpha : float, default=1
+        Constant that multiplies the penalty term and thus determines the
+        regularization strength. ``alpha = 0`` is equivalent to unpenalized
+        GLMs. In this case, the design matrix `X` must have full column rank
+        (no collinearities).
+        Values must be in the range `[0.0, inf)`.
+
+    fit_intercept : bool, default=True
+        Specifies if a constant (a.k.a. bias or intercept) should be
+        added to the linear predictor (X @ coef + intercept).
+
+    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
+        Algorithm to use in the optimization problem:
+
+        'lbfgs'
+            Calls scipy's L-BFGS-B optimizer.
+
+        'newton-cholesky'
+            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
+            iterated reweighted least squares) with an inner Cholesky based solver.
+            This solver is a good choice for `n_samples` >> `n_features`, especially
+            with one-hot encoded categorical features with rare categories. Be aware
+            that the memory usage of this solver has a quadratic dependency on
+            `n_features` because it explicitly computes the Hessian matrix.
+
+            .. versionadded:: 1.2
+
+    max_iter : int, default=100
+        The maximal number of iterations for the solver.
+        Values must be in the range `[1, inf)`.
+
+    tol : float, default=1e-4
+        Stopping criterion. For the lbfgs solver,
+        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+        where ``g_j`` is the j-th component of the gradient (derivative) of
+        the objective function.
+        Values must be in the range `(0.0, inf)`.
+
+    warm_start : bool, default=False
+        If set to ``True``, reuse the solution of the previous call to ``fit``
+        as initialization for ``coef_`` and ``intercept_``.
+
+    verbose : int, default=0
+        For the lbfgs solver set verbose to any positive number for verbosity.
+        Values must be in the range `[0, inf)`.
+
+    Attributes
+    ----------
+    coef_ : array of shape (n_features,)
+        Estimated coefficients for the linear predictor (`X @ coef_ +
+        intercept_`) in the GLM.
+
+    intercept_ : float
+        Intercept (a.k.a. bias) added to linear predictor.
+
+    n_iter_ : int
+        Actual number of iterations used in the solver.
+
+    _base_loss : BaseLoss, default=HalfSquaredError()
+        This is set during fit via `self._get_loss()`.
+        A `_base_loss` contains a specific loss function as well as the link
+        function. The loss to be minimized specifies the distributional assumption of
+        the GLM, i.e. the distribution from the EDM. Here are some examples:
+
+        =======================  ========  ==========================
+        _base_loss               Link      Target Domain
+        =======================  ========  ==========================
+        HalfSquaredError         identity  y any real number
+        HalfPoissonLoss          log       0 <= y
+        HalfGammaLoss            log       0 < y
+        HalfTweedieLoss          log       dependent on tweedie power
+        HalfTweedieLossIdentity  identity  dependent on tweedie power
+        =======================  ========  ==========================
+
+        The link function of the GLM, i.e. mapping from linear predictor
+        `X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link,
+        we have `y_pred = exp(X @ coeff + intercept)`.
+    """
+
+    # We allow for NewtonSolver classes for the "solver" parameter but do not
+    # make them public in the docstrings. This facilitates testing and
+    # benchmarking.
+    _parameter_constraints: dict = {
+        "alpha": [Interval(Real, 0.0, None, closed="left")],
+        "fit_intercept": ["boolean"],
+        "solver": [
+            StrOptions({"lbfgs", "newton-cholesky"}),
+            Hidden(type),
+        ],
+        "max_iter": [Interval(Integral, 1, None, closed="left")],
+        "tol": [Interval(Real, 0.0, None, closed="neither")],
+        "warm_start": ["boolean"],
+        "verbose": ["verbose"],
+    }
+
+    def __init__(
+        self,
+        *,
+        alpha=1.0,
+        fit_intercept=True,
+        solver="lbfgs",
+        max_iter=100,
+        tol=1e-4,
+        warm_start=False,
+        verbose=0,
+    ):
+        self.alpha = alpha
+        self.fit_intercept = fit_intercept
+        self.solver = solver
+        self.max_iter = max_iter
+        self.tol = tol
+        self.warm_start = warm_start
+        self.verbose = verbose
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, sample_weight=None):
+        """Fit a Generalized Linear Model.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        self : object
+            Fitted model.
+        """
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=["csc", "csr"],
+            dtype=[np.float64, np.float32],
+            y_numeric=True,
+            multi_output=False,
+        )
+
+        # required by losses
+        if self.solver == "lbfgs":
+            # lbfgs will force coef and therefore raw_prediction to be float64. The
+            # base_loss needs y, X @ coef and sample_weight all of same dtype
+            # (and contiguous).
+            loss_dtype = np.float64
+        else:
+            loss_dtype = min(max(y.dtype, X.dtype), np.float64)
+        y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False)
+
+        if sample_weight is not None:
+            # Note that _check_sample_weight calls check_array(order="C") required by
+            # losses.
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)
+
+        n_samples, n_features = X.shape
+        self._base_loss = self._get_loss()
+
+        linear_loss = LinearModelLoss(
+            base_loss=self._base_loss,
+            fit_intercept=self.fit_intercept,
+        )
+
+        if not linear_loss.base_loss.in_y_true_range(y):
+            raise ValueError(
+                "Some value(s) of y are out of the valid range of the loss"
+                f" {self._base_loss.__class__.__name__!r}."
+            )
+
+        # TODO: if alpha=0 check that X is not rank deficient
+
+        # NOTE: Rescaling of sample_weight:
+        # We want to minimize
+        #     obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance)
+        #         + 1/2 * alpha * L2,
+        # with
+        #     deviance = 2 * loss.
+        # The objective is invariant to multiplying sample_weight by a constant. We
+        # could choose this constant such that sum(sample_weight) = 1 in order to end
+        # up with
+        #     obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
+        # But LinearModelLoss.loss() already computes
+        #     average(loss, weights=sample_weight)
+        # Thus, without rescaling, we have
+        #     obj = LinearModelLoss.loss(...)
+
+        if self.warm_start and hasattr(self, "coef_"):
+            if self.fit_intercept:
+                # LinearModelLoss needs intercept at the end of coefficient array.
+                coef = np.concatenate((self.coef_, np.array([self.intercept_])))
+            else:
+                coef = self.coef_
+            coef = coef.astype(loss_dtype, copy=False)
+        else:
+            coef = linear_loss.init_zero_coef(X, dtype=loss_dtype)
+            if self.fit_intercept:
+                coef[-1] = linear_loss.base_loss.link.link(
+                    np.average(y, weights=sample_weight)
+                )
+
+        l2_reg_strength = self.alpha
+        n_threads = _openmp_effective_n_threads()
+
+        # Algorithms for optimization:
+        # Note again that our losses implement 1/2 * deviance.
+        if self.solver == "lbfgs":
+            func = linear_loss.loss_gradient
+
+            opt_res = scipy.optimize.minimize(
+                func,
+                coef,
+                method="L-BFGS-B",
+                jac=True,
+                options={
+                    "maxiter": self.max_iter,
+                    "maxls": 50,  # default is 20
+                    "iprint": self.verbose - 1,
+                    "gtol": self.tol,
+                    # The constant 64 was found empirically to pass the test suite.
+                    # The point is that ftol is very small, but a bit larger than
+                    # machine precision for float64, which is the dtype used by lbfgs.
+                    "ftol": 64 * np.finfo(float).eps,
+                },
+                args=(X, y, sample_weight, l2_reg_strength, n_threads),
+            )
+            self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
+            coef = opt_res.x
+        elif self.solver == "newton-cholesky":
+            sol = NewtonCholeskySolver(
+                coef=coef,
+                linear_loss=linear_loss,
+                l2_reg_strength=l2_reg_strength,
+                tol=self.tol,
+                max_iter=self.max_iter,
+                n_threads=n_threads,
+                verbose=self.verbose,
+            )
+            coef = sol.solve(X, y, sample_weight)
+            self.n_iter_ = sol.iteration
+        elif issubclass(self.solver, NewtonSolver):
+            sol = self.solver(
+                coef=coef,
+                linear_loss=linear_loss,
+                l2_reg_strength=l2_reg_strength,
+                tol=self.tol,
+                max_iter=self.max_iter,
+                n_threads=n_threads,
+            )
+            coef = sol.solve(X, y, sample_weight)
+            self.n_iter_ = sol.iteration
+        else:
+            raise ValueError(f"Invalid solver={self.solver}.")
+
+        if self.fit_intercept:
+            self.intercept_ = coef[-1]
+            self.coef_ = coef[:-1]
+        else:
+            # set intercept to zero as the other linear models do
+            self.intercept_ = 0.0
+            self.coef_ = coef
+
+        return self
+
+    def _linear_predictor(self, X):
+        """Compute the linear_predictor = `X @ coef_ + intercept_`.
+
+        Note that we often use the term raw_prediction instead of linear predictor.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        y_pred : array of shape (n_samples,)
+            Returns predicted values of linear predictor.
+        """
+        check_is_fitted(self)
+        X = self._validate_data(
+            X,
+            accept_sparse=["csr", "csc", "coo"],
+            dtype=[np.float64, np.float32],
+            ensure_2d=True,
+            allow_nd=False,
+            reset=False,
+        )
+        return X @ self.coef_ + self.intercept_
+
+    def predict(self, X):
+        """Predict using GLM with feature matrix X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        y_pred : array of shape (n_samples,)
+            Returns predicted values.
+        """
+        # check_array is done in _linear_predictor
+        raw_prediction = self._linear_predictor(X)
+        y_pred = self._base_loss.link.inverse(raw_prediction)
+        return y_pred
+
+    def score(self, X, y, sample_weight=None):
+        """Compute D^2, the percentage of deviance explained.
+
+        D^2 is a generalization of the coefficient of determination R^2.
+        R^2 uses squared error and D^2 uses the deviance of this GLM, see the
+        :ref:`User Guide <regression_metrics>`.
+
+        D^2 is defined as
+        :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,
+        :math:`D_{null}` is the null deviance, i.e. the deviance of a model
+        with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.
+        The mean :math:`\\bar{y}` is averaged by sample_weight.
+        Best possible score is 1.0 and it can be negative (because the model
+        can be arbitrarily worse).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Test samples.
+
+        y : array-like of shape (n_samples,)
+            True values of target.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            D^2 of self.predict(X) w.r.t. y.
+        """
+        # TODO: Adapt link to User Guide in the docstring, once
+        # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged.
+        #
+        # Note, default score defined in RegressorMixin is R^2 score.
+        # TODO: make D^2 a score function in module metrics (and thereby get
+        #       input validation and so on)
+        raw_prediction = self._linear_predictor(X)  # validates X
+        # required by losses
+        y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False)
+
+        if sample_weight is not None:
+            # Note that _check_sample_weight calls check_array(order="C") required by
+            # losses.
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype)
+
+        base_loss = self._base_loss
+
+        if not base_loss.in_y_true_range(y):
+            raise ValueError(
+                "Some value(s) of y are out of the valid range of the loss"
+                f" {base_loss.__name__}."
+            )
+
+        constant = np.average(
+            base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None),
+            weights=sample_weight,
+        )
+
+        # Missing factor of 2 in deviance cancels out.
+        deviance = base_loss(
+            y_true=y,
+            raw_prediction=raw_prediction,
+            sample_weight=sample_weight,
+            n_threads=1,
+        )
+        y_mean = base_loss.link.link(np.average(y, weights=sample_weight))
+        deviance_null = base_loss(
+            y_true=y,
+            raw_prediction=np.tile(y_mean, y.shape[0]),
+            sample_weight=sample_weight,
+            n_threads=1,
+        )
+        return 1 - (deviance + constant) / (deviance_null + constant)
+
+    def _more_tags(self):
+        try:
+            # Create instance of BaseLoss if fit wasn't called yet. This is necessary as
+            # TweedieRegressor might set the used loss during fit different from
+            # self._base_loss.
+            base_loss = self._get_loss()
+            return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)}
+        except (ValueError, AttributeError, TypeError):
+            # This happens when the link or power parameter of TweedieRegressor is
+            # invalid. We fallback on the default tags in that case.
+            return {}
+
+    def _get_loss(self):
+        """This is only necessary because of the link and power arguments of the
+        TweedieRegressor.
+
+        Note that we do not need to pass sample_weight to the loss class as this is
+        only needed to set loss.constant_hessian on which GLMs do not rely.
+        """
+        return HalfSquaredError()
+
+
+class PoissonRegressor(_GeneralizedLinearRegressor):
+    """Generalized Linear Model with a Poisson distribution.
+
+    This regressor uses the 'log' link function.
+
+    Read more in the :ref:`User Guide <Generalized_linear_models>`.
+
+    .. versionadded:: 0.23
+
+    Parameters
+    ----------
+    alpha : float, default=1
+        Constant that multiplies the L2 penalty term and determines the
+        regularization strength. ``alpha = 0`` is equivalent to unpenalized
+        GLMs. In this case, the design matrix `X` must have full column rank
+        (no collinearities).
+        Values of `alpha` must be in the range `[0.0, inf)`.
+
+    fit_intercept : bool, default=True
+        Specifies if a constant (a.k.a. bias or intercept) should be
+        added to the linear predictor (`X @ coef + intercept`).
+
+    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
+        Algorithm to use in the optimization problem:
+
+        'lbfgs'
+            Calls scipy's L-BFGS-B optimizer.
+
+        'newton-cholesky'
+            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
+            iterated reweighted least squares) with an inner Cholesky based solver.
+            This solver is a good choice for `n_samples` >> `n_features`, especially
+            with one-hot encoded categorical features with rare categories. Be aware
+            that the memory usage of this solver has a quadratic dependency on
+            `n_features` because it explicitly computes the Hessian matrix.
+
+            .. versionadded:: 1.2
+
+    max_iter : int, default=100
+        The maximal number of iterations for the solver.
+        Values must be in the range `[1, inf)`.
+
+    tol : float, default=1e-4
+        Stopping criterion. For the lbfgs solver,
+        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+        where ``g_j`` is the j-th component of the gradient (derivative) of
+        the objective function.
+        Values must be in the range `(0.0, inf)`.
+
+    warm_start : bool, default=False
+        If set to ``True``, reuse the solution of the previous call to ``fit``
+        as initialization for ``coef_`` and ``intercept_`` .
+
+    verbose : int, default=0
+        For the lbfgs solver set verbose to any positive number for verbosity.
+        Values must be in the range `[0, inf)`.
+
+    Attributes
+    ----------
+    coef_ : array of shape (n_features,)
+        Estimated coefficients for the linear predictor (`X @ coef_ +
+        intercept_`) in the GLM.
+
+    intercept_ : float
+        Intercept (a.k.a. bias) added to linear predictor.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_iter_ : int
+        Actual number of iterations used in the solver.
+
+    See Also
+    --------
+    TweedieRegressor : Generalized Linear Model with a Tweedie distribution.
+
+    Examples
+    --------
+    >>> from sklearn import linear_model
+    >>> clf = linear_model.PoissonRegressor()
+    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
+    >>> y = [12, 17, 22, 21]
+    >>> clf.fit(X, y)
+    PoissonRegressor()
+    >>> clf.score(X, y)
+    0.990...
+    >>> clf.coef_
+    array([0.121..., 0.158...])
+    >>> clf.intercept_
+    2.088...
+    >>> clf.predict([[1, 1], [3, 4]])
+    array([10.676..., 21.875...])
+    """
+
+    _parameter_constraints: dict = {
+        **_GeneralizedLinearRegressor._parameter_constraints
+    }
+
+    def __init__(
+        self,
+        *,
+        alpha=1.0,
+        fit_intercept=True,
+        solver="lbfgs",
+        max_iter=100,
+        tol=1e-4,
+        warm_start=False,
+        verbose=0,
+    ):
+        super().__init__(
+            alpha=alpha,
+            fit_intercept=fit_intercept,
+            solver=solver,
+            max_iter=max_iter,
+            tol=tol,
+            warm_start=warm_start,
+            verbose=verbose,
+        )
+
+    def _get_loss(self):
+        return HalfPoissonLoss()
+
+
+class GammaRegressor(_GeneralizedLinearRegressor):
+    """Generalized Linear Model with a Gamma distribution.
+
+    This regressor uses the 'log' link function.
+
+    Read more in the :ref:`User Guide <Generalized_linear_models>`.
+
+    .. versionadded:: 0.23
+
+    Parameters
+    ----------
+    alpha : float, default=1
+        Constant that multiplies the L2 penalty term and determines the
+        regularization strength. ``alpha = 0`` is equivalent to unpenalized
+        GLMs. In this case, the design matrix `X` must have full column rank
+        (no collinearities).
+        Values of `alpha` must be in the range `[0.0, inf)`.
+
+    fit_intercept : bool, default=True
+        Specifies if a constant (a.k.a. bias or intercept) should be
+        added to the linear predictor `X @ coef_ + intercept_`.
+
+    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
+        Algorithm to use in the optimization problem:
+
+        'lbfgs'
+            Calls scipy's L-BFGS-B optimizer.
+
+        'newton-cholesky'
+            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
+            iterated reweighted least squares) with an inner Cholesky based solver.
+            This solver is a good choice for `n_samples` >> `n_features`, especially
+            with one-hot encoded categorical features with rare categories. Be aware
+            that the memory usage of this solver has a quadratic dependency on
+            `n_features` because it explicitly computes the Hessian matrix.
+
+            .. versionadded:: 1.2
+
+    max_iter : int, default=100
+        The maximal number of iterations for the solver.
+        Values must be in the range `[1, inf)`.
+
+    tol : float, default=1e-4
+        Stopping criterion. For the lbfgs solver,
+        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+        where ``g_j`` is the j-th component of the gradient (derivative) of
+        the objective function.
+        Values must be in the range `(0.0, inf)`.
+
+    warm_start : bool, default=False
+        If set to ``True``, reuse the solution of the previous call to ``fit``
+        as initialization for `coef_` and `intercept_`.
+
+    verbose : int, default=0
+        For the lbfgs solver set verbose to any positive number for verbosity.
+        Values must be in the range `[0, inf)`.
+
+    Attributes
+    ----------
+    coef_ : array of shape (n_features,)
+        Estimated coefficients for the linear predictor (`X @ coef_ +
+        intercept_`) in the GLM.
+
+    intercept_ : float
+        Intercept (a.k.a. bias) added to linear predictor.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    n_iter_ : int
+        Actual number of iterations used in the solver.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    PoissonRegressor : Generalized Linear Model with a Poisson distribution.
+    TweedieRegressor : Generalized Linear Model with a Tweedie distribution.
+
+    Examples
+    --------
+    >>> from sklearn import linear_model
+    >>> clf = linear_model.GammaRegressor()
+    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
+    >>> y = [19, 26, 33, 30]
+    >>> clf.fit(X, y)
+    GammaRegressor()
+    >>> clf.score(X, y)
+    0.773...
+    >>> clf.coef_
+    array([0.072..., 0.066...])
+    >>> clf.intercept_
+    2.896...
+    >>> clf.predict([[1, 0], [2, 8]])
+    array([19.483..., 35.795...])
+    """
+
+    _parameter_constraints: dict = {
+        **_GeneralizedLinearRegressor._parameter_constraints
+    }
+
+    def __init__(
+        self,
+        *,
+        alpha=1.0,
+        fit_intercept=True,
+        solver="lbfgs",
+        max_iter=100,
+        tol=1e-4,
+        warm_start=False,
+        verbose=0,
+    ):
+        super().__init__(
+            alpha=alpha,
+            fit_intercept=fit_intercept,
+            solver=solver,
+            max_iter=max_iter,
+            tol=tol,
+            warm_start=warm_start,
+            verbose=verbose,
+        )
+
+    def _get_loss(self):
+        return HalfGammaLoss()
+
+
+class TweedieRegressor(_GeneralizedLinearRegressor):
+    """Generalized Linear Model with a Tweedie distribution.
+
+    This estimator can be used to model different GLMs depending on the
+    ``power`` parameter, which determines the underlying distribution.
+
+    Read more in the :ref:`User Guide <Generalized_linear_models>`.
+
+    .. versionadded:: 0.23
+
+    Parameters
+    ----------
+    power : float, default=0
+            The power determines the underlying target distribution according
+            to the following table:
+
+            +-------+------------------------+
+            | Power | Distribution           |
+            +=======+========================+
+            | 0     | Normal                 |
+            +-------+------------------------+
+            | 1     | Poisson                |
+            +-------+------------------------+
+            | (1,2) | Compound Poisson Gamma |
+            +-------+------------------------+
+            | 2     | Gamma                  |
+            +-------+------------------------+
+            | 3     | Inverse Gaussian       |
+            +-------+------------------------+
+
+            For ``0 < power < 1``, no distribution exists.
+
+    alpha : float, default=1
+        Constant that multiplies the L2 penalty term and determines the
+        regularization strength. ``alpha = 0`` is equivalent to unpenalized
+        GLMs. In this case, the design matrix `X` must have full column rank
+        (no collinearities).
+        Values of `alpha` must be in the range `[0.0, inf)`.
+
+    fit_intercept : bool, default=True
+        Specifies if a constant (a.k.a. bias or intercept) should be
+        added to the linear predictor (`X @ coef + intercept`).
+
+    link : {'auto', 'identity', 'log'}, default='auto'
+        The link function of the GLM, i.e. mapping from linear predictor
+        `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets
+        the link depending on the chosen `power` parameter as follows:
+
+        - 'identity' for ``power <= 0``, e.g. for the Normal distribution
+        - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian
+          distributions
+
+    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
+        Algorithm to use in the optimization problem:
+
+        'lbfgs'
+            Calls scipy's L-BFGS-B optimizer.
+
+        'newton-cholesky'
+            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
+            iterated reweighted least squares) with an inner Cholesky based solver.
+            This solver is a good choice for `n_samples` >> `n_features`, especially
+            with one-hot encoded categorical features with rare categories. Be aware
+            that the memory usage of this solver has a quadratic dependency on
+            `n_features` because it explicitly computes the Hessian matrix.
+
+            .. versionadded:: 1.2
+
+    max_iter : int, default=100
+        The maximal number of iterations for the solver.
+        Values must be in the range `[1, inf)`.
+
+    tol : float, default=1e-4
+        Stopping criterion. For the lbfgs solver,
+        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+        where ``g_j`` is the j-th component of the gradient (derivative) of
+        the objective function.
+        Values must be in the range `(0.0, inf)`.
+
+    warm_start : bool, default=False
+        If set to ``True``, reuse the solution of the previous call to ``fit``
+        as initialization for ``coef_`` and ``intercept_`` .
+
+    verbose : int, default=0
+        For the lbfgs solver set verbose to any positive number for verbosity.
+        Values must be in the range `[0, inf)`.
+
+    Attributes
+    ----------
+    coef_ : array of shape (n_features,)
+        Estimated coefficients for the linear predictor (`X @ coef_ +
+        intercept_`) in the GLM.
+
+    intercept_ : float
+        Intercept (a.k.a. bias) added to linear predictor.
+
+    n_iter_ : int
+        Actual number of iterations used in the solver.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    PoissonRegressor : Generalized Linear Model with a Poisson distribution.
+    GammaRegressor : Generalized Linear Model with a Gamma distribution.
+
+    Examples
+    --------
+    >>> from sklearn import linear_model
+    >>> clf = linear_model.TweedieRegressor()
+    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
+    >>> y = [2, 3.5, 5, 5.5]
+    >>> clf.fit(X, y)
+    TweedieRegressor()
+    >>> clf.score(X, y)
+    0.839...
+    >>> clf.coef_
+    array([0.599..., 0.299...])
+    >>> clf.intercept_
+    1.600...
+    >>> clf.predict([[1, 1], [3, 4]])
+    array([2.500..., 4.599...])
+    """
+
+    _parameter_constraints: dict = {
+        **_GeneralizedLinearRegressor._parameter_constraints,
+        "power": [Interval(Real, None, None, closed="neither")],
+        "link": [StrOptions({"auto", "identity", "log"})],
+    }
+
+    def __init__(
+        self,
+        *,
+        power=0.0,
+        alpha=1.0,
+        fit_intercept=True,
+        link="auto",
+        solver="lbfgs",
+        max_iter=100,
+        tol=1e-4,
+        warm_start=False,
+        verbose=0,
+    ):
+        super().__init__(
+            alpha=alpha,
+            fit_intercept=fit_intercept,
+            solver=solver,
+            max_iter=max_iter,
+            tol=tol,
+            warm_start=warm_start,
+            verbose=verbose,
+        )
+        self.link = link
+        self.power = power
+
+    def _get_loss(self):
+        if self.link == "auto":
+            if self.power <= 0:
+                # identity link
+                return HalfTweedieLossIdentity(power=self.power)
+            else:
+                # log link
+                return HalfTweedieLoss(power=self.power)
+
+        if self.link == "log":
+            return HalfTweedieLoss(power=self.power)
+
+        if self.link == "identity":
+            return HalfTweedieLossIdentity(power=self.power)
@@ -0,0 +1,2 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
@@ -0,0 +1,352 @@
+# Authors: Manoj Kumar mks542@nyu.edu
+# License: BSD 3 clause
+
+from numbers import Integral, Real
+
+import numpy as np
+from scipy import optimize
+
+from ..base import BaseEstimator, RegressorMixin, _fit_context
+from ..utils._mask import axis0_safe_slice
+from ..utils._param_validation import Interval
+from ..utils.extmath import safe_sparse_dot
+from ..utils.optimize import _check_optimize_result
+from ..utils.validation import _check_sample_weight
+from ._base import LinearModel
+
+
+def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None):
+    """Returns the Huber loss and the gradient.
+
+    Parameters
+    ----------
+    w : ndarray, shape (n_features + 1,) or (n_features + 2,)
+        Feature vector.
+        w[:n_features] gives the coefficients
+        w[-1] gives the scale factor and if the intercept is fit w[-2]
+        gives the intercept factor.
+
+    X : ndarray of shape (n_samples, n_features)
+        Input data.
+
+    y : ndarray of shape (n_samples,)
+        Target vector.
+
+    epsilon : float
+        Robustness of the Huber estimator.
+
+    alpha : float
+        Regularization parameter.
+
+    sample_weight : ndarray of shape (n_samples,), default=None
+        Weight assigned to each sample.
+
+    Returns
+    -------
+    loss : float
+        Huber loss.
+
+    gradient : ndarray, shape (len(w))
+        Returns the derivative of the Huber loss with respect to each
+        coefficient, intercept and the scale as a vector.
+    """
+    _, n_features = X.shape
+    fit_intercept = n_features + 2 == w.shape[0]
+    if fit_intercept:
+        intercept = w[-2]
+    sigma = w[-1]
+    w = w[:n_features]
+    n_samples = np.sum(sample_weight)
+
+    # Calculate the values where |y - X'w -c / sigma| > epsilon
+    # The values above this threshold are outliers.
+    linear_loss = y - safe_sparse_dot(X, w)
+    if fit_intercept:
+        linear_loss -= intercept
+    abs_linear_loss = np.abs(linear_loss)
+    outliers_mask = abs_linear_loss > epsilon * sigma
+
+    # Calculate the linear loss due to the outliers.
+    # This is equal to (2 * M * |y - X'w -c / sigma| - M**2) * sigma
+    outliers = abs_linear_loss[outliers_mask]
+    num_outliers = np.count_nonzero(outliers_mask)
+    n_non_outliers = X.shape[0] - num_outliers
+
+    # n_sq_outliers includes the weight give to the outliers while
+    # num_outliers is just the number of outliers.
+    outliers_sw = sample_weight[outliers_mask]
+    n_sw_outliers = np.sum(outliers_sw)
+    outlier_loss = (
+        2.0 * epsilon * np.sum(outliers_sw * outliers)
+        - sigma * n_sw_outliers * epsilon**2
+    )
+
+    # Calculate the quadratic loss due to the non-outliers.-
+    # This is equal to |(y - X'w - c)**2 / sigma**2| * sigma
+    non_outliers = linear_loss[~outliers_mask]
+    weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers
+    weighted_loss = np.dot(weighted_non_outliers.T, non_outliers)
+    squared_loss = weighted_loss / sigma
+
+    if fit_intercept:
+        grad = np.zeros(n_features + 2)
+    else:
+        grad = np.zeros(n_features + 1)
+
+    # Gradient due to the squared loss.
+    X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers)
+    grad[:n_features] = (
+        2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers)
+    )
+
+    # Gradient due to the linear loss.
+    signed_outliers = np.ones_like(outliers)
+    signed_outliers_mask = linear_loss[outliers_mask] < 0
+    signed_outliers[signed_outliers_mask] = -1.0
+    X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers)
+    sw_outliers = sample_weight[outliers_mask] * signed_outliers
+    grad[:n_features] -= 2.0 * epsilon * (safe_sparse_dot(sw_outliers, X_outliers))
+
+    # Gradient due to the penalty.
+    grad[:n_features] += alpha * 2.0 * w
+
+    # Gradient due to sigma.
+    grad[-1] = n_samples
+    grad[-1] -= n_sw_outliers * epsilon**2
+    grad[-1] -= squared_loss / sigma
+
+    # Gradient due to the intercept.
+    if fit_intercept:
+        grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma
+        grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers)
+
+    loss = n_samples * sigma + squared_loss + outlier_loss
+    loss += alpha * np.dot(w, w)
+    return loss, grad
+
+
+class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
+    """L2-regularized linear regression model that is robust to outliers.
+
+    The Huber Regressor optimizes the squared loss for the samples where
+    ``|(y - Xw - c) / sigma| < epsilon`` and the absolute loss for the samples
+    where ``|(y - Xw - c) / sigma| > epsilon``, where the model coefficients
+    ``w``, the intercept ``c`` and the scale ``sigma`` are parameters
+    to be optimized. The parameter sigma makes sure that if y is scaled up
+    or down by a certain factor, one does not need to rescale epsilon to
+    achieve the same robustness. Note that this does not take into account
+    the fact that the different features of X may be of different scales.
+
+    The Huber loss function has the advantage of not being heavily influenced
+    by the outliers while not completely ignoring their effect.
+
+    Read more in the :ref:`User Guide <huber_regression>`
+
+    .. versionadded:: 0.18
+
+    Parameters
+    ----------
+    epsilon : float, default=1.35
+        The parameter epsilon controls the number of samples that should be
+        classified as outliers. The smaller the epsilon, the more robust it is
+        to outliers. Epsilon must be in the range `[1, inf)`.
+
+    max_iter : int, default=100
+        Maximum number of iterations that
+        ``scipy.optimize.minimize(method="L-BFGS-B")`` should run for.
+
+    alpha : float, default=0.0001
+        Strength of the squared L2 regularization. Note that the penalty is
+        equal to ``alpha * ||w||^2``.
+        Must be in the range `[0, inf)`.
+
+    warm_start : bool, default=False
+        This is useful if the stored attributes of a previously used model
+        has to be reused. If set to False, then the coefficients will
+        be rewritten for every call to fit.
+        See :term:`the Glossary <warm_start>`.
+
+    fit_intercept : bool, default=True
+        Whether or not to fit the intercept. This can be set to False
+        if the data is already centered around the origin.
+
+    tol : float, default=1e-05
+        The iteration will stop when
+        ``max{|proj g_i | i = 1, ..., n}`` <= ``tol``
+        where pg_i is the i-th component of the projected gradient.
+
+    Attributes
+    ----------
+    coef_ : array, shape (n_features,)
+        Features got by optimizing the L2-regularized Huber loss.
+
+    intercept_ : float
+        Bias.
+
+    scale_ : float
+        The value by which ``|y - Xw - c|`` is scaled down.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_iter_ : int
+        Number of iterations that
+        ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for.
+
+        .. versionchanged:: 0.20
+
+            In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
+            ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.
+
+    outliers_ : array, shape (n_samples,)
+        A boolean mask which is set to True where the samples are identified
+        as outliers.
+
+    See Also
+    --------
+    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
+    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
+    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.
+
+    References
+    ----------
+    .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics
+           Concomitant scale estimates, pg 172
+    .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression.
+           https://statweb.stanford.edu/~owen/reports/hhu.pdf
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.linear_model import HuberRegressor, LinearRegression
+    >>> from sklearn.datasets import make_regression
+    >>> rng = np.random.RandomState(0)
+    >>> X, y, coef = make_regression(
+    ...     n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0)
+    >>> X[:4] = rng.uniform(10, 20, (4, 2))
+    >>> y[:4] = rng.uniform(10, 20, 4)
+    >>> huber = HuberRegressor().fit(X, y)
+    >>> huber.score(X, y)
+    -7.284...
+    >>> huber.predict(X[:1,])
+    array([806.7200...])
+    >>> linear = LinearRegression().fit(X, y)
+    >>> print("True coefficients:", coef)
+    True coefficients: [20.4923...  34.1698...]
+    >>> print("Huber coefficients:", huber.coef_)
+    Huber coefficients: [17.7906... 31.0106...]
+    >>> print("Linear Regression coefficients:", linear.coef_)
+    Linear Regression coefficients: [-1.9221...  7.0226...]
+    """
+
+    _parameter_constraints: dict = {
+        "epsilon": [Interval(Real, 1.0, None, closed="left")],
+        "max_iter": [Interval(Integral, 0, None, closed="left")],
+        "alpha": [Interval(Real, 0, None, closed="left")],
+        "warm_start": ["boolean"],
+        "fit_intercept": ["boolean"],
+        "tol": [Interval(Real, 0.0, None, closed="left")],
+    }
+
+    def __init__(
+        self,
+        *,
+        epsilon=1.35,
+        max_iter=100,
+        alpha=0.0001,
+        warm_start=False,
+        fit_intercept=True,
+        tol=1e-05,
+    ):
+        self.epsilon = epsilon
+        self.max_iter = max_iter
+        self.alpha = alpha
+        self.warm_start = warm_start
+        self.fit_intercept = fit_intercept
+        self.tol = tol
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, sample_weight=None):
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training vector, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target vector relative to X.
+
+        sample_weight : array-like, shape (n_samples,)
+            Weight given to each sample.
+
+        Returns
+        -------
+        self : object
+            Fitted `HuberRegressor` estimator.
+        """
+        X, y = self._validate_data(
+            X,
+            y,
+            copy=False,
+            accept_sparse=["csr"],
+            y_numeric=True,
+            dtype=[np.float64, np.float32],
+        )
+
+        sample_weight = _check_sample_weight(sample_weight, X)
+
+        if self.warm_start and hasattr(self, "coef_"):
+            parameters = np.concatenate((self.coef_, [self.intercept_, self.scale_]))
+        else:
+            if self.fit_intercept:
+                parameters = np.zeros(X.shape[1] + 2)
+            else:
+                parameters = np.zeros(X.shape[1] + 1)
+            # Make sure to initialize the scale parameter to a strictly
+            # positive value:
+            parameters[-1] = 1
+
+        # Sigma or the scale factor should be non-negative.
+        # Setting it to be zero might cause undefined bounds hence we set it
+        # to a value close to zero.
+        bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))
+        bounds[-1][0] = np.finfo(np.float64).eps * 10
+
+        opt_res = optimize.minimize(
+            _huber_loss_and_gradient,
+            parameters,
+            method="L-BFGS-B",
+            jac=True,
+            args=(X, y, self.epsilon, self.alpha, sample_weight),
+            options={"maxiter": self.max_iter, "gtol": self.tol, "iprint": -1},
+            bounds=bounds,
+        )
+
+        parameters = opt_res.x
+
+        if opt_res.status == 2:
+            raise ValueError(
+                "HuberRegressor convergence failed: l-BFGS-b solver terminated with %s"
+                % opt_res.message
+            )
+        self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
+        self.scale_ = parameters[-1]
+        if self.fit_intercept:
+            self.intercept_ = parameters[-2]
+        else:
+            self.intercept_ = 0.0
+        self.coef_ = parameters[: X.shape[1]]
+
+        residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)
+        self.outliers_ = residual > self.scale_ * self.epsilon
+        return self
@@ -0,0 +1,672 @@
+"""
+Loss functions for linear models with raw_prediction = X @ coef
+"""
+
+import numpy as np
+from scipy import sparse
+
+from ..utils.extmath import squared_norm
+
+
+class LinearModelLoss:
+    """General class for loss functions with raw_prediction = X @ coef + intercept.
+
+    Note that raw_prediction is also known as linear predictor.
+
+    The loss is the average of per sample losses and includes a term for L2
+    regularization::
+
+        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
+               + 1/2 * l2_reg_strength * ||coef||_2^2
+
+    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.
+
+    Gradient and hessian, for simplicity without intercept, are::
+
+        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
+        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
+                  + l2_reg_strength * identity
+
+    Conventions:
+        if fit_intercept:
+            n_dof =  n_features + 1
+        else:
+            n_dof = n_features
+
+        if base_loss.is_multiclass:
+            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
+        else:
+            coef.shape = (n_dof,)
+
+        The intercept term is at the end of the coef array:
+        if base_loss.is_multiclass:
+            if coef.shape (n_classes, n_dof):
+                intercept = coef[:, -1]
+            if coef.shape (n_classes * n_dof,)
+                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
+            intercept.shape = (n_classes,)
+        else:
+            intercept = coef[-1]
+
+    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as
+
+        coef.reshape((n_classes, -1), order="F")
+
+    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
+    coefficients without intercept, coef[:, :-1], contiguous and speeds up
+    matrix-vector computations.
+
+    Note: The data term is already the weighted average of the per sample losses,
+    i.e. it is divided by s_sum. Rescaling sample_weight by a positive constant
+    therefore does not change the loss, the gradient or the hessian.
+
+    Parameters
+    ----------
+    base_loss : instance of class BaseLoss from sklearn._loss.
+    fit_intercept : bool
+    """
+
+    def __init__(self, base_loss, fit_intercept):
+        # Both settings are stored unchanged; the other methods read them back
+        # from the instance (base_loss.is_multiclass, base_loss.n_classes, ...).
+        self.fit_intercept = fit_intercept
+        self.base_loss = base_loss
+
+    def init_zero_coef(self, X, dtype=None):
+        """Create an all-zero coefficient array with the layout used by this class.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data. It provides n_features and acts as the prototype
+            passed to ``np.zeros_like``.
+        dtype : data-type, default=None
+            Data type of the returned array. With dtype=None the dtype is taken
+            from X.
+
+        Returns
+        -------
+        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
+            Zero coefficients, where n_dof is n_features plus one if an intercept
+            is fitted. The 2d (multiclass) array is F-ordered.
+        """
+        n_features = X.shape[1]
+        n_classes = self.base_loss.n_classes
+        # One extra degree of freedom holds the intercept, if there is one.
+        n_dof = n_features + 1 if self.fit_intercept else n_features
+        if not self.base_loss.is_multiclass:
+            return np.zeros_like(X, shape=n_dof, dtype=dtype)
+        return np.zeros_like(X, shape=(n_classes, n_dof), dtype=dtype, order="F")
+
+    def weight_intercept(self, coef):
+        """Split coef into the feature weights and the intercept.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model, intercept stored last. A ravelled
+            multiclass coef is read in F-order, i.e. it is turned back into 2d
+            via coef.reshape((n_classes, -1), order="F").
+
+        Returns
+        -------
+        weights : ndarray of shape (n_features,) or (n_classes, n_features)
+            Coefficients without the intercept term.
+        intercept : float or ndarray of shape (n_classes,)
+            Intercept terms; the float 0.0 if no intercept is fitted.
+        """
+        if self.base_loss.is_multiclass:
+            # Work on the 2d layout (n_classes, n_dof) in all cases.
+            if coef.ndim == 1:
+                coef_2d = coef.reshape((self.base_loss.n_classes, -1), order="F")
+            else:
+                coef_2d = coef
+            if self.fit_intercept:
+                return coef_2d[:, :-1], coef_2d[:, -1]
+            return coef_2d, 0.0
+
+        if self.fit_intercept:
+            return coef[:-1], coef[-1]
+        return coef, 0.0
+
+    def weight_intercept_raw(self, coef, X):
+        """Split coef into weights and intercept and evaluate the linear predictor.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model, intercept stored last. A ravelled
+            multiclass coef is read in F-order, i.e. it is turned back into 2d
+            via coef.reshape((n_classes, -1), order="F").
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        Returns
+        -------
+        weights : ndarray of shape (n_features,) or (n_classes, n_features)
+            Coefficients without the intercept term.
+        intercept : float or ndarray of shape (n_classes,)
+            Intercept terms.
+        raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes)
+            X @ weights + intercept, respectively X @ weights.T + intercept in
+            the multiclass case.
+        """
+        weights, intercept = self.weight_intercept(coef)
+        # Multiclass weights have shape (n_classes, n_features); transposing them
+        # gives a product of shape (n_samples, n_classes).
+        linear_map = weights.T if self.base_loss.is_multiclass else weights
+        raw_prediction = X @ linear_map + intercept
+        return weights, intercept, raw_prediction
+
+    def l2_penalty(self, weights, l2_reg_strength):
+        """Return the L2 penalty 0.5 * l2_reg_strength * ||weights||_2^2."""
+        if weights.ndim == 1:
+            sq_norm = weights @ weights
+        else:
+            # 2d (multiclass) weights: squared norm over all entries.
+            sq_norm = squared_norm(weights)
+        return 0.5 * l2_reg_strength * sq_norm
+
+    def loss(
+        self,
+        coef,
+        X,
+        y,
+        sample_weight=None,
+        l2_reg_strength=0.0,
+        n_threads=1,
+        raw_prediction=None,
+    ):
+        """Compute the weighted average of the point-wise losses plus L2 penalty.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model, intercept stored last. A ravelled
+            multiclass coef is read in F-order, i.e. it is turned back into 2d
+            via coef.reshape((n_classes, -1), order="F").
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+        y : contiguous array of shape (n_samples,)
+            Observed, true target values.
+        sample_weight : None or contiguous array of shape (n_samples,), default=None
+            Sample weights; None gives every sample the same weight.
+        l2_reg_strength : float, default=0.0
+            L2 regularization strength. The intercept is not penalized.
+        n_threads : int, default=1
+            Number of OpenMP threads to use; forwarded to the base loss.
+        raw_prediction : None or array of shape (n_samples,) or (n_samples, n_classes)
+            Raw prediction values (in link space). If None, they are computed as
+            X @ coef + intercept.
+
+        Returns
+        -------
+        loss : float
+            Weighted average of losses per sample, plus penalty.
+        """
+        if raw_prediction is not None:
+            weights, _ = self.weight_intercept(coef)
+        else:
+            weights, _, raw_prediction = self.weight_intercept_raw(coef, X)
+
+        # The base loss is evaluated without weights; the sample weights enter
+        # only through the weighted average below.
+        per_sample = self.base_loss.loss(
+            y_true=y,
+            raw_prediction=raw_prediction,
+            sample_weight=None,
+            n_threads=n_threads,
+        )
+        data_term = np.average(per_sample, weights=sample_weight)
+        return data_term + self.l2_penalty(weights, l2_reg_strength)
+
+    def loss_gradient(
+        self,
+        coef,
+        X,
+        y,
+        sample_weight=None,
+        l2_reg_strength=0.0,
+        n_threads=1,
+        raw_prediction=None,
+    ):
+        """Compute the averaged, penalized loss and its gradient w.r.t. coef.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model, intercept stored last. A ravelled
+            multiclass coef is read in F-order, i.e. it is turned back into 2d
+            via coef.reshape((n_classes, -1), order="F").
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+        y : contiguous array of shape (n_samples,)
+            Observed, true target values.
+        sample_weight : None or contiguous array of shape (n_samples,), default=None
+            Sample weights; None gives every sample the same weight.
+        l2_reg_strength : float, default=0.0
+            L2 regularization strength. The intercept is not penalized.
+        n_threads : int, default=1
+            Number of OpenMP threads to use; forwarded to the base loss.
+        raw_prediction : None or array of shape (n_samples,) or (n_samples, n_classes)
+            Raw prediction values (in link space). If None, they are computed as
+            X @ coef + intercept.
+
+        Returns
+        -------
+        loss : float
+            Weighted average of losses per sample, plus penalty.
+
+        gradient : ndarray of shape coef.shape
+             The gradient of the loss.
+        """
+        n_samples, n_features = X.shape
+        n_classes = self.base_loss.n_classes
+        n_dof = n_features + int(self.fit_intercept)
+
+        if raw_prediction is not None:
+            weights, _ = self.weight_intercept(coef)
+        else:
+            weights, _, raw_prediction = self.weight_intercept_raw(coef, X)
+
+        per_sample, grad_pointwise = self.base_loss.loss_gradient(
+            y_true=y,
+            raw_prediction=raw_prediction,
+            sample_weight=sample_weight,
+            n_threads=n_threads,
+        )
+        # Normalize by the total weight so that loss and gradient are averages.
+        sw_sum = np.sum(sample_weight) if sample_weight is not None else n_samples
+        loss = per_sample.sum() / sw_sum
+        loss += self.l2_penalty(weights, l2_reg_strength)
+        grad_pointwise /= sw_sum
+
+        if self.base_loss.is_multiclass:
+            # grad_pointwise.shape = (n_samples, n_classes)
+            grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
+            grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
+            if self.fit_intercept:
+                grad[:, -1] = grad_pointwise.sum(axis=0)
+            # Hand the gradient back in the same layout coef came in.
+            if coef.ndim == 1:
+                grad = grad.ravel(order="F")
+            return loss, grad
+
+        grad = np.empty_like(coef, dtype=weights.dtype)
+        grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
+        if self.fit_intercept:
+            grad[-1] = grad_pointwise.sum()
+        return loss, grad
+
+    def gradient(
+        self,
+        coef,
+        X,
+        y,
+        sample_weight=None,
+        l2_reg_strength=0.0,
+        n_threads=1,
+        raw_prediction=None,
+    ):
+        """Compute the gradient of the averaged, penalized loss w.r.t. coef.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model, intercept stored last. A ravelled
+            multiclass coef is read in F-order, i.e. it is turned back into 2d
+            via coef.reshape((n_classes, -1), order="F").
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+        y : contiguous array of shape (n_samples,)
+            Observed, true target values.
+        sample_weight : None or contiguous array of shape (n_samples,), default=None
+            Sample weights; None gives every sample the same weight.
+        l2_reg_strength : float, default=0.0
+            L2 regularization strength. The intercept is not penalized.
+        n_threads : int, default=1
+            Number of OpenMP threads to use; forwarded to the base loss.
+        raw_prediction : None or array of shape (n_samples,) or (n_samples, n_classes)
+            Raw prediction values (in link space). If None, they are computed as
+            X @ coef + intercept.
+
+        Returns
+        -------
+        gradient : ndarray of shape coef.shape
+             The gradient of the loss.
+        """
+        n_samples, n_features = X.shape
+        n_classes = self.base_loss.n_classes
+        n_dof = n_features + int(self.fit_intercept)
+
+        if raw_prediction is not None:
+            weights, _ = self.weight_intercept(coef)
+        else:
+            weights, _, raw_prediction = self.weight_intercept_raw(coef, X)
+
+        grad_pointwise = self.base_loss.gradient(
+            y_true=y,
+            raw_prediction=raw_prediction,
+            sample_weight=sample_weight,
+            n_threads=n_threads,
+        )
+        # Normalize by the total weight so that the gradient is an average.
+        sw_sum = np.sum(sample_weight) if sample_weight is not None else n_samples
+        grad_pointwise /= sw_sum
+
+        if self.base_loss.is_multiclass:
+            # grad_pointwise.shape = (n_samples, n_classes)
+            grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
+            grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
+            if self.fit_intercept:
+                grad[:, -1] = grad_pointwise.sum(axis=0)
+            # Hand the gradient back in the same layout coef came in.
+            return grad.ravel(order="F") if coef.ndim == 1 else grad
+
+        grad = np.empty_like(coef, dtype=weights.dtype)
+        grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
+        if self.fit_intercept:
+            grad[-1] = grad_pointwise.sum()
+        return grad
+
+    def gradient_hessian(
+        self,
+        coef,
+        X,
+        y,
+        sample_weight=None,
+        l2_reg_strength=0.0,
+        n_threads=1,
+        gradient_out=None,
+        hessian_out=None,
+        raw_prediction=None,
+    ):
+        """Computes gradient and hessian w.r.t. coef.
+
+        Only implemented for non-multiclass base losses.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+        y : contiguous array of shape (n_samples,)
+            Observed, true target values.
+        sample_weight : None or contiguous array of shape (n_samples,), default=None
+            Sample weights.
+        l2_reg_strength : float, default=0.0
+            L2 regularization strength. The intercept is not penalized.
+        n_threads : int, default=1
+            Number of OpenMP threads to use.
+        gradient_out : None or ndarray of shape coef.shape
+            A location into which the gradient is stored. If None, a new array
+            is created.
+        hessian_out : None or ndarray of shape (n_dof, n_dof)
+            A location into which the hessian is stored. Any memory layout (C- or
+            F-ordered, strided view) is supported. If None, a new array is
+            created.
+        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
+            shape (n_samples, n_classes)
+            Raw prediction values (in link space). If provided, these are used. If
+            None, then raw_prediction = X @ coef + intercept is calculated.
+
+        Returns
+        -------
+        gradient : ndarray of shape coef.shape
+             The gradient of the loss.
+
+        hessian : ndarray of shape (n_dof, n_dof)
+            Hessian matrix, built from the absolute values of the pointwise
+            hessian. Its content is left unfilled if hessian_warning is True.
+
+        hessian_warning : bool
+            True if more than 25% of the pointwise hessian values are
+            non-positive.
+
+        Raises
+        ------
+        NotImplementedError
+            If the base loss is multiclass.
+        """
+        n_samples, n_features = X.shape
+        n_dof = n_features + int(self.fit_intercept)
+
+        if raw_prediction is None:
+            weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
+        else:
+            weights, intercept = self.weight_intercept(coef)
+
+        grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
+            y_true=y,
+            raw_prediction=raw_prediction,
+            sample_weight=sample_weight,
+            n_threads=n_threads,
+        )
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
+        grad_pointwise /= sw_sum
+        hess_pointwise /= sw_sum
+
+        # For non-canonical link functions and far away from the optimum, the pointwise
+        # hessian can be negative. We take care that 75% of the hessian entries are
+        # positive.
+        hessian_warning = np.mean(hess_pointwise <= 0) > 0.25
+        hess_pointwise = np.abs(hess_pointwise)
+
+        if not self.base_loss.is_multiclass:
+            # gradient
+            if gradient_out is None:
+                grad = np.empty_like(coef, dtype=weights.dtype)
+            else:
+                grad = gradient_out
+            grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
+            if self.fit_intercept:
+                grad[-1] = grad_pointwise.sum()
+
+            # hessian
+            if hessian_out is None:
+                hess = np.empty(shape=(n_dof, n_dof), dtype=weights.dtype)
+            else:
+                hess = hessian_out
+
+            if hessian_warning:
+                # Exit early without computing the hessian.
+                return grad, hess, hessian_warning
+
+            # TODO: This "sandwich product", X' diag(W) X, is the main computational
+            # bottleneck for solvers. A dedicated Cython routine might improve it
+            # exploiting the symmetry (as opposed to, e.g., BLAS gemm).
+            if sparse.issparse(X):
+                hess[:n_features, :n_features] = (
+                    X.T
+                    @ sparse.dia_matrix(
+                        (hess_pointwise, 0), shape=(n_samples, n_samples)
+                    )
+                    @ X
+                ).toarray()
+            else:
+                # np.einsum may use less memory but the following, using BLAS matrix
+                # multiplication (gemm), is by far faster.
+                WX = hess_pointwise[:, None] * X
+                hess[:n_features, :n_features] = np.dot(X.T, WX)
+
+            if l2_reg_strength > 0:
+                # The L2 penalty enters the Hessian on the diagonal of the feature
+                # block only. The diagonal is addressed by explicit indices: a
+                # flattened hess.reshape(-1) is a copy whenever hess is not
+                # C-contiguous (e.g. an F-ordered hessian_out), and the penalty
+                # would then be silently lost.
+                diag = np.arange(n_features)
+                hess[diag, diag] += l2_reg_strength
+
+            if self.fit_intercept:
+                # With intercept included as added column to X, the hessian becomes
+                # hess = (X, 1)' @ diag(h) @ (X, 1)
+                #      = (X' @ diag(h) @ X, X' @ h)
+                #        (           h @ X, sum(h))
+                # The left upper part has already been filled, it remains to compute
+                # the last row and the last column.
+                Xh = X.T @ hess_pointwise
+                hess[:-1, -1] = Xh
+                hess[-1, :-1] = Xh
+                hess[-1, -1] = hess_pointwise.sum()
+        else:
+            # Here we may safely assume HalfMultinomialLoss aka categorical
+            # cross-entropy.
+            raise NotImplementedError
+
+        return grad, hess, hessian_warning
+
+    def gradient_hessian_product(
+        self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
+    ):
+        """Computes gradient and hessp (hessian product function) w.r.t. coef.
+
+        Both the gradient and the hessian product are divided by the sum of the
+        sample weights (n_samples if sample_weight is None) and include the L2
+        penalty on the weights; the intercept is not penalized.
+
+        Parameters
+        ----------
+        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
+            Coefficients of a linear model.
+            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
+            i.e. one reconstructs the 2d-array via
+            coef.reshape((n_classes, -1), order="F").
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+        y : contiguous array of shape (n_samples,)
+            Observed, true target values.
+        sample_weight : None or contiguous array of shape (n_samples,), default=None
+            Sample weights.
+        l2_reg_strength : float, default=0.0
+            L2 regularization strength
+        n_threads : int, default=1
+            Number of OpenMP threads to use.
+
+        Returns
+        -------
+        gradient : ndarray of shape coef.shape
+             The gradient of the loss.
+
+        hessp : callable
+            Function that takes in a vector input of shape of gradient and
+            returns matrix-vector product with hessian. In the multiclass case
+            the result is ravelled (F-order) if coef is 1d.
+        """
+        (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
+        n_dof = n_features + int(self.fit_intercept)
+        weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
+
+        if not self.base_loss.is_multiclass:
+            grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
+                y_true=y,
+                raw_prediction=raw_prediction,
+                sample_weight=sample_weight,
+                n_threads=n_threads,
+            )
+            # Normalize in place so that gradient and hessian refer to the
+            # averaged loss.
+            grad_pointwise /= sw_sum
+            hess_pointwise /= sw_sum
+            grad = np.empty_like(coef, dtype=weights.dtype)
+            grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
+            if self.fit_intercept:
+                grad[-1] = grad_pointwise.sum()
+
+            # Precompute as much as possible: hX, hX_sum and hessian_sum
+            hessian_sum = hess_pointwise.sum()
+            if sparse.issparse(X):
+                hX = (
+                    sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples))
+                    @ X
+                )
+            else:
+                hX = hess_pointwise[:, np.newaxis] * X
+
+            if self.fit_intercept:
+                # Calculate the double derivative with respect to intercept.
+                # Note: In case hX is sparse, hX.sum is a matrix object.
+                # hX_sum equals X' @ h; it is only defined, and only used by
+                # hessp below, when an intercept is fitted.
+                hX_sum = np.squeeze(np.asarray(hX.sum(axis=0)))
+                # prevent squeezing to zero-dim array if n_features == 1
+                hX_sum = np.atleast_1d(hX_sum)
+
+            # With intercept included and l2_reg_strength = 0, hessp returns
+            # res = (X, 1)' @ diag(h) @ (X, 1) @ s
+            #     = (X, 1)' @ (hX @ s[:n_features] + h * s[-1])
+            # res[:n_features] = X' @ hX @ s[:n_features] + (X' @ h) * s[-1]
+            # res[-1] = 1' @ hX @ s[:n_features] + sum(h) * s[-1]
+            # where X' @ h is hX_sum and sum(h) is hessian_sum.
+            def hessp(s):
+                ret = np.empty_like(s)
+                if sparse.issparse(X):
+                    ret[:n_features] = X.T @ (hX @ s[:n_features])
+                else:
+                    ret[:n_features] = np.linalg.multi_dot([X.T, hX, s[:n_features]])
+                # L2 penalty contributes l2_reg_strength * identity on the weights.
+                ret[:n_features] += l2_reg_strength * s[:n_features]
+
+                if self.fit_intercept:
+                    ret[:n_features] += s[-1] * hX_sum
+                    ret[-1] = hX_sum @ s[:n_features] + hessian_sum * s[-1]
+                return ret
+
+        else:
+            # Here we may safely assume HalfMultinomialLoss aka categorical
+            # cross-entropy.
+            # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e.
+            # diagonal in the classes. Here, we want the matrix-vector product of the
+            # full hessian. Therefore, we call gradient_proba.
+            grad_pointwise, proba = self.base_loss.gradient_proba(
+                y_true=y,
+                raw_prediction=raw_prediction,
+                sample_weight=sample_weight,
+                n_threads=n_threads,
+            )
+            grad_pointwise /= sw_sum
+            grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
+            grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
+            if self.fit_intercept:
+                grad[:, -1] = grad_pointwise.sum(axis=0)
+
+            # Full hessian-vector product, i.e. not only the diagonal part of the
+            # hessian. Derivation with some index battle for input vector s:
+            #   - sample index i
+            #   - feature indices j, m
+            #   - class indices k, l
+            #   - 1_{k=l} is one if k=l else 0
+            #   - p_i_k is the (predicted) probability that sample i belongs to class k
+            #     for all i: sum_k p_i_k = 1
+            #   - s_l_m is input vector for class l and feature m
+            #   - X' = X transposed
+            #
+            # Note: Hessian with dropping most indices is just:
+            #       X' @ p_k (1(k=l) - p_l) @ X
+            #
+            # result_{k j} = sum_{i, l, m} Hessian_{i, k j, m l} * s_l_m
+            #   = sum_{i, l, m} (X')_{ji} * p_i_k * (1_{k=l} - p_i_l)
+            #                   * X_{im} s_l_m
+            #   = sum_{i, m} (X')_{ji} * p_i_k
+            #                * (X_{im} * s_k_m - sum_l p_i_l * X_{im} * s_l_m)
+            #
+            # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411  # noqa
+            def hessp(s):
+                s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
+                if self.fit_intercept:
+                    s_intercept = s[:, -1]
+                    s = s[:, :-1]  # shape = (n_classes, n_features)
+                else:
+                    s_intercept = 0
+                # The following three updates of tmp are done in place and depend
+                # on their order.
+                tmp = X @ s.T + s_intercept  # X_{im} * s_k_m
+                tmp += (-proba * tmp).sum(axis=1)[:, np.newaxis]  # - sum_l ..
+                tmp *= proba  # * p_i_k
+                if sample_weight is not None:
+                    tmp *= sample_weight[:, np.newaxis]
+                # hess_prod = empty_like(grad), but we ravel grad below and this
+                # function is run after that.
+                hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
+                hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s
+                if self.fit_intercept:
+                    hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum
+                if coef.ndim == 1:
+                    return hess_prod.ravel(order="F")
+                else:
+                    return hess_prod
+
+            # Return the gradient in the same (ravelled) layout as coef.
+            if coef.ndim == 1:
+                return grad.ravel(order="F"), hessp
+
+        return grad, hessp
@@ -0,0 +1,575 @@
+# Authors: Rob Zinkov, Mathieu Blondel
+# License: BSD 3 clause
+from numbers import Real
+
+from ..base import _fit_context
+from ..utils._param_validation import Interval, StrOptions
+from ._stochastic_gradient import DEFAULT_EPSILON, BaseSGDClassifier, BaseSGDRegressor
+
+
+class PassiveAggressiveClassifier(BaseSGDClassifier):
+    """Passive Aggressive Classifier.
+
+    Read more in the :ref:`User Guide <passive_aggressive>`.
+
+    Parameters
+    ----------
+    C : float, default=1.0
+        Maximum step size (regularization). Defaults to 1.0.
+
+    fit_intercept : bool, default=True
+        Whether the intercept should be estimated or not. If False, the
+        data is assumed to be already centered.
+
+    max_iter : int, default=1000
+        The maximum number of passes over the training data (aka epochs).
+        It only impacts the behavior in the ``fit`` method, and not the
+        :meth:`~sklearn.linear_model.PassiveAggressiveClassifier.partial_fit` method.
+
+        .. versionadded:: 0.19
+
+    tol : float or None, default=1e-3
+        The stopping criterion. If it is not None, the iterations will stop
+        when (loss > previous_loss - tol).
+
+        .. versionadded:: 0.19
+
+    early_stopping : bool, default=False
+        Whether to use early stopping to terminate training when validation
+        score is not improving. If set to True, it will automatically set aside
+        a stratified fraction of training data as validation and terminate
+        training when validation score is not improving by at least `tol` for
+        `n_iter_no_change` consecutive epochs.
+
+        .. versionadded:: 0.20
+
+    validation_fraction : float, default=0.1
+        The proportion of training data to set aside as validation set for
+        early stopping. Must be between 0 and 1.
+        Only used if early_stopping is True.
+
+        .. versionadded:: 0.20
+
+    n_iter_no_change : int, default=5
+        Number of iterations with no improvement to wait before early stopping.
+
+        .. versionadded:: 0.20
+
+    shuffle : bool, default=True
+        Whether or not the training data should be shuffled after each epoch.
+
+    verbose : int, default=0
+        The verbosity level.
+
+    loss : str, default="hinge"
+        The loss function to be used:
+        hinge: equivalent to PA-I in the reference paper.
+        squared_hinge: equivalent to PA-II in the reference paper.
+
+    n_jobs : int or None, default=None
+        The number of CPUs to use to do the OVA (One Versus All, for
+        multi-class problems) computation.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    random_state : int, RandomState instance, default=None
+        Used to shuffle the training data, when ``shuffle`` is set to
+        ``True``. Pass an int for reproducible output across multiple
+        function calls.
+        See :term:`Glossary <random_state>`.
+
+    warm_start : bool, default=False
+        When set to True, reuse the solution of the previous call to fit as
+        initialization, otherwise, just erase the previous solution.
+        See :term:`the Glossary <warm_start>`.
+
+        Repeatedly calling fit or partial_fit when warm_start is True can
+        result in a different solution than when calling fit a single time
+        because of the way the data is shuffled.
+
+    class_weight : dict, {class_label: weight} or "balanced" or None, \
+            default=None
+        Preset for the class_weight fit parameter.
+
+        Weights associated with classes. If not given, all classes
+        are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``.
+
+        .. versionadded:: 0.17
+           parameter *class_weight* to automatically weight samples.
+
+    average : bool or int, default=False
+        When set to True, computes the averaged SGD weights and stores the
+        result in the ``coef_`` attribute. If set to an int greater than 1,
+        averaging will begin once the total number of samples seen reaches
+        average. So average=10 will begin averaging after seeing 10 samples.
+
+        .. versionadded:: 0.19
+           parameter *average* to use weights averaging in SGD.
+
+    Attributes
+    ----------
+    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
+            (n_classes, n_features)
+        Weights assigned to the features.
+
+    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
+        Constants in decision function.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_iter_ : int
+        The actual number of iterations to reach the stopping criterion.
+        For multiclass fits, it is the maximum over every binary fit.
+
+    classes_ : ndarray of shape (n_classes,)
+        The unique classes labels.
+
+    t_ : int
+        Number of weight updates performed during training.
+        Same as ``(n_iter_ * n_samples + 1)``.
+
+    loss_function_ : callable
+        Loss function used by the algorithm.
+
+    See Also
+    --------
+    SGDClassifier : Incrementally trained logistic regression.
+    Perceptron : Linear perceptron classifier.
+
+    References
+    ----------
+    Online Passive-Aggressive Algorithms
+    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>
+    K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import PassiveAggressiveClassifier
+    >>> from sklearn.datasets import make_classification
+    >>> X, y = make_classification(n_features=4, random_state=0)
+    >>> clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0,
+    ... tol=1e-3)
+    >>> clf.fit(X, y)
+    PassiveAggressiveClassifier(random_state=0)
+    >>> print(clf.coef_)
+    [[0.26642044 0.45070924 0.67251877 0.64185414]]
+    >>> print(clf.intercept_)
+    [1.84127814]
+    >>> print(clf.predict([[0, 0, 0, 0]]))
+    [1]
+    """
+
+    _parameter_constraints: dict = {
+        **BaseSGDClassifier._parameter_constraints,
+        "loss": [StrOptions({"hinge", "squared_hinge"})],
+        "C": [Interval(Real, 0, None, closed="right")],
+    }
+
+    def __init__(
+        self,
+        *,
+        C=1.0,
+        fit_intercept=True,
+        max_iter=1000,
+        tol=1e-3,
+        early_stopping=False,
+        validation_fraction=0.1,
+        n_iter_no_change=5,
+        shuffle=True,
+        verbose=0,
+        loss="hinge",
+        n_jobs=None,
+        random_state=None,
+        warm_start=False,
+        class_weight=None,
+        average=False,
+    ):
+        super().__init__(
+            penalty=None,
+            fit_intercept=fit_intercept,
+            max_iter=max_iter,
+            tol=tol,
+            early_stopping=early_stopping,
+            validation_fraction=validation_fraction,
+            n_iter_no_change=n_iter_no_change,
+            shuffle=shuffle,
+            verbose=verbose,
+            random_state=random_state,
+            eta0=1.0,
+            warm_start=warm_start,
+            class_weight=class_weight,
+            average=average,
+            n_jobs=n_jobs,
+        )
+
+        self.C = C
+        self.loss = loss
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def partial_fit(self, X, y, classes=None):
+        """Fit linear model with Passive Aggressive algorithm.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Subset of the training data.
+
+        y : array-like of shape (n_samples,)
+            Subset of the target values.
+
+        classes : ndarray of shape (n_classes,)
+            Classes across all calls to partial_fit.
+            Can be obtained by via `np.unique(y_all)`, where y_all is the
+            target vector of the entire dataset.
+            This argument is required for the first call to partial_fit
+            and can be omitted in the subsequent calls.
+            Note that y doesn't need to contain all labels in `classes`.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        if not hasattr(self, "classes_"):
+            self._more_validate_params(for_partial_fit=True)
+
+            if self.class_weight == "balanced":
+                raise ValueError(
+                    "class_weight 'balanced' is not supported for "
+                    "partial_fit. For 'balanced' weights, use "
+                    "`sklearn.utils.compute_class_weight` with "
+                    "`class_weight='balanced'`. In place of y you "
+                    "can use a large enough subset of the full "
+                    "training set target to properly estimate the "
+                    "class frequency distributions. Pass the "
+                    "resulting weights as the class_weight "
+                    "parameter."
+                )
+
+        lr = "pa1" if self.loss == "hinge" else "pa2"
+        return self._partial_fit(
+            X,
+            y,
+            alpha=1.0,
+            C=self.C,
+            loss="hinge",
+            learning_rate=lr,
+            max_iter=1,
+            classes=classes,
+            sample_weight=None,
+            coef_init=None,
+            intercept_init=None,
+        )
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, coef_init=None, intercept_init=None):
+        """Fit linear model with Passive Aggressive algorithm.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        coef_init : ndarray of shape (n_classes, n_features)
+            The initial coefficients to warm-start the optimization.
+
+        intercept_init : ndarray of shape (n_classes,)
+            The initial intercept to warm-start the optimization.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        self._more_validate_params()
+
+        lr = "pa1" if self.loss == "hinge" else "pa2"
+        return self._fit(
+            X,
+            y,
+            alpha=1.0,
+            C=self.C,
+            loss="hinge",
+            learning_rate=lr,
+            coef_init=coef_init,
+            intercept_init=intercept_init,
+        )
+
+
+class PassiveAggressiveRegressor(BaseSGDRegressor):
+    """Passive Aggressive Regressor.
+
+    Read more in the :ref:`User Guide <passive_aggressive>`.
+
+    Parameters
+    ----------
+
+    C : float, default=1.0
+        Maximum step size (regularization). Defaults to 1.0.
+
+    fit_intercept : bool, default=True
+        Whether the intercept should be estimated or not. If False, the
+        data is assumed to be already centered. Defaults to True.
+
+    max_iter : int, default=1000
+        The maximum number of passes over the training data (aka epochs).
+        It only impacts the behavior in the ``fit`` method, and not the
+        :meth:`~sklearn.linear_model.PassiveAggressiveRegressor.partial_fit` method.
+
+        .. versionadded:: 0.19
+
+    tol : float or None, default=1e-3
+        The stopping criterion. If it is not None, the iterations will stop
+        when (loss > previous_loss - tol).
+
+        .. versionadded:: 0.19
+
+    early_stopping : bool, default=False
+        Whether to use early stopping to terminate training when validation.
+        score is not improving. If set to True, it will automatically set aside
+        a fraction of training data as validation and terminate
+        training when validation score is not improving by at least tol for
+        n_iter_no_change consecutive epochs.
+
+        .. versionadded:: 0.20
+
+    validation_fraction : float, default=0.1
+        The proportion of training data to set aside as validation set for
+        early stopping. Must be between 0 and 1.
+        Only used if early_stopping is True.
+
+        .. versionadded:: 0.20
+
+    n_iter_no_change : int, default=5
+        Number of iterations with no improvement to wait before early stopping.
+
+        .. versionadded:: 0.20
+
+    shuffle : bool, default=True
+        Whether or not the training data should be shuffled after each epoch.
+
+    verbose : int, default=0
+        The verbosity level.
+
+    loss : str, default="epsilon_insensitive"
+        The loss function to be used:
+        epsilon_insensitive: equivalent to PA-I in the reference paper.
+        squared_epsilon_insensitive: equivalent to PA-II in the reference
+        paper.
+
+    epsilon : float, default=0.1
+        If the difference between the current prediction and the correct label
+        is below this threshold, the model is not updated.
+
+    random_state : int, RandomState instance, default=None
+        Used to shuffle the training data, when ``shuffle`` is set to
+        ``True``. Pass an int for reproducible output across multiple
+        function calls.
+        See :term:`Glossary <random_state>`.
+
+    warm_start : bool, default=False
+        When set to True, reuse the solution of the previous call to fit as
+        initialization, otherwise, just erase the previous solution.
+        See :term:`the Glossary <warm_start>`.
+
+        Repeatedly calling fit or partial_fit when warm_start is True can
+        result in a different solution than when calling fit a single time
+        because of the way the data is shuffled.
+
+    average : bool or int, default=False
+        When set to True, computes the averaged SGD weights and stores the
+        result in the ``coef_`` attribute. If set to an int greater than 1,
+        averaging will begin once the total number of samples seen reaches
+        average. So average=10 will begin averaging after seeing 10 samples.
+
+        .. versionadded:: 0.19
+           parameter *average* to use weights averaging in SGD.
+
+    Attributes
+    ----------
+    coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
+            n_features]
+        Weights assigned to the features.
+
+    intercept_ : array, shape = [1] if n_classes == 2 else [n_classes]
+        Constants in decision function.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_iter_ : int
+        The actual number of iterations to reach the stopping criterion.
+
+    t_ : int
+        Number of weight updates performed during training.
+        Same as ``(n_iter_ * n_samples + 1)``.
+
+    See Also
+    --------
+    SGDRegressor : Linear model fitted by minimizing a regularized
+        empirical loss with SGD.
+
+    References
+    ----------
+    Online Passive-Aggressive Algorithms
+    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>
+    K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006).
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import PassiveAggressiveRegressor
+    >>> from sklearn.datasets import make_regression
+
+    >>> X, y = make_regression(n_features=4, random_state=0)
+    >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0,
+    ... tol=1e-3)
+    >>> regr.fit(X, y)
+    PassiveAggressiveRegressor(max_iter=100, random_state=0)
+    >>> print(regr.coef_)
+    [20.48736655 34.18818427 67.59122734 87.94731329]
+    >>> print(regr.intercept_)
+    [-0.02306214]
+    >>> print(regr.predict([[0, 0, 0, 0]]))
+    [-0.02306214]
+    """
+
+    _parameter_constraints: dict = {
+        **BaseSGDRegressor._parameter_constraints,
+        "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})],
+        "C": [Interval(Real, 0, None, closed="right")],
+        "epsilon": [Interval(Real, 0, None, closed="left")],
+    }
+
+    def __init__(
+        self,
+        *,
+        C=1.0,
+        fit_intercept=True,
+        max_iter=1000,
+        tol=1e-3,
+        early_stopping=False,
+        validation_fraction=0.1,
+        n_iter_no_change=5,
+        shuffle=True,
+        verbose=0,
+        loss="epsilon_insensitive",
+        epsilon=DEFAULT_EPSILON,
+        random_state=None,
+        warm_start=False,
+        average=False,
+    ):
+        super().__init__(
+            penalty=None,
+            l1_ratio=0,
+            epsilon=epsilon,
+            eta0=1.0,
+            fit_intercept=fit_intercept,
+            max_iter=max_iter,
+            tol=tol,
+            early_stopping=early_stopping,
+            validation_fraction=validation_fraction,
+            n_iter_no_change=n_iter_no_change,
+            shuffle=shuffle,
+            verbose=verbose,
+            random_state=random_state,
+            warm_start=warm_start,
+            average=average,
+        )
+        self.C = C
+        self.loss = loss
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def partial_fit(self, X, y):
+        """Fit linear model with Passive Aggressive algorithm.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Subset of training data.
+
+        y : numpy array of shape [n_samples]
+            Subset of target values.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        if not hasattr(self, "coef_"):
+            self._more_validate_params(for_partial_fit=True)
+
+        lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2"
+        return self._partial_fit(
+            X,
+            y,
+            alpha=1.0,
+            C=self.C,
+            loss="epsilon_insensitive",
+            learning_rate=lr,
+            max_iter=1,
+            sample_weight=None,
+            coef_init=None,
+            intercept_init=None,
+        )
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, coef_init=None, intercept_init=None):
+        """Fit linear model with Passive Aggressive algorithm.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : numpy array of shape [n_samples]
+            Target values.
+
+        coef_init : array, shape = [n_features]
+            The initial coefficients to warm-start the optimization.
+
+        intercept_init : array, shape = [1]
+            The initial intercept to warm-start the optimization.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        self._more_validate_params()
+
+        lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2"
+        return self._fit(
+            X,
+            y,
+            alpha=1.0,
+            C=self.C,
+            loss="epsilon_insensitive",
+            learning_rate=lr,
+            coef_init=coef_init,
+            intercept_init=intercept_init,
+        )
@@ -0,0 +1,229 @@
+# Author: Mathieu Blondel
+# License: BSD 3 clause
+from numbers import Real
+
+from ..utils._param_validation import Interval, StrOptions
+from ._stochastic_gradient import BaseSGDClassifier
+
+
+class Perceptron(BaseSGDClassifier):
+    """Linear perceptron classifier.
+
+    The implementation is a wrapper around :class:`~sklearn.linear_model.SGDClassifier`
+    by fixing the `loss` and `learning_rate` parameters as::
+
+        SGDClassifier(loss="perceptron", learning_rate="constant")
+
+    Other available parameters are described below and are forwarded to
+    :class:`~sklearn.linear_model.SGDClassifier`.
+
+    Read more in the :ref:`User Guide <perceptron>`.
+
+    Parameters
+    ----------
+
+    penalty : {'l2','l1','elasticnet'}, default=None
+        The penalty (aka regularization term) to be used.
+
+    alpha : float, default=0.0001
+        Constant that multiplies the regularization term if regularization is
+        used.
+
+    l1_ratio : float, default=0.15
+        The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.
+        `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.
+        Only used if `penalty='elasticnet'`.
+
+        .. versionadded:: 0.24
+
+    fit_intercept : bool, default=True
+        Whether the intercept should be estimated or not. If False, the
+        data is assumed to be already centered.
+
+    max_iter : int, default=1000
+        The maximum number of passes over the training data (aka epochs).
+        It only impacts the behavior in the ``fit`` method, and not the
+        :meth:`partial_fit` method.
+
+        .. versionadded:: 0.19
+
+    tol : float or None, default=1e-3
+        The stopping criterion. If it is not None, the iterations will stop
+        when (loss > previous_loss - tol).
+
+        .. versionadded:: 0.19
+
+    shuffle : bool, default=True
+        Whether or not the training data should be shuffled after each epoch.
+
+    verbose : int, default=0
+        The verbosity level.
+
+    eta0 : float, default=1
+        Constant by which the updates are multiplied.
+
+    n_jobs : int, default=None
+        The number of CPUs to use to do the OVA (One Versus All, for
+        multi-class problems) computation.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    random_state : int, RandomState instance or None, default=0
+        Used to shuffle the training data, when ``shuffle`` is set to
+        ``True``. Pass an int for reproducible output across multiple
+        function calls.
+        See :term:`Glossary <random_state>`.
+
+    early_stopping : bool, default=False
+        Whether to use early stopping to terminate training when validation
+        score is not improving. If set to True, it will automatically set aside
+        a stratified fraction of training data as validation and terminate
+        training when validation score is not improving by at least `tol` for
+        `n_iter_no_change` consecutive epochs.
+
+        .. versionadded:: 0.20
+
+    validation_fraction : float, default=0.1
+        The proportion of training data to set aside as validation set for
+        early stopping. Must be between 0 and 1.
+        Only used if early_stopping is True.
+
+        .. versionadded:: 0.20
+
+    n_iter_no_change : int, default=5
+        Number of iterations with no improvement to wait before early stopping.
+
+        .. versionadded:: 0.20
+
+    class_weight : dict, {class_label: weight} or "balanced", default=None
+        Preset for the class_weight fit parameter.
+
+        Weights associated with classes. If not given, all classes
+        are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``.
+
+    warm_start : bool, default=False
+        When set to True, reuse the solution of the previous call to fit as
+        initialization, otherwise, just erase the previous solution. See
+        :term:`the Glossary <warm_start>`.
+
+    Attributes
+    ----------
+    classes_ : ndarray of shape (n_classes,)
+        The unique classes labels.
+
+    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
+            (n_classes, n_features)
+        Weights assigned to the features.
+
+    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
+        Constants in decision function.
+
+    loss_function_ : concrete LossFunction
+        The function that determines the loss, or difference between the
+        output of the algorithm and the target values.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_iter_ : int
+        The actual number of iterations to reach the stopping criterion.
+        For multiclass fits, it is the maximum over every binary fit.
+
+    t_ : int
+        Number of weight updates performed during training.
+        Same as ``(n_iter_ * n_samples + 1)``.
+
+    See Also
+    --------
+    sklearn.linear_model.SGDClassifier : Linear classifiers
+        (SVM, logistic regression, etc.) with SGD training.
+
+    Notes
+    -----
+    ``Perceptron`` is a classification algorithm which shares the same
+    underlying implementation with ``SGDClassifier``. In fact,
+    ``Perceptron()`` is equivalent to `SGDClassifier(loss="perceptron",
+    eta0=1, learning_rate="constant", penalty=None)`.
+
+    References
+    ----------
+    https://en.wikipedia.org/wiki/Perceptron and references therein.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import load_digits
+    >>> from sklearn.linear_model import Perceptron
+    >>> X, y = load_digits(return_X_y=True)
+    >>> clf = Perceptron(tol=1e-3, random_state=0)
+    >>> clf.fit(X, y)
+    Perceptron()
+    >>> clf.score(X, y)
+    0.939...
+    """
+
+    _parameter_constraints: dict = {**BaseSGDClassifier._parameter_constraints}
+    _parameter_constraints.pop("loss")
+    _parameter_constraints.pop("average")
+    _parameter_constraints.update(
+        {
+            "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None],
+            "alpha": [Interval(Real, 0, None, closed="left")],
+            "l1_ratio": [Interval(Real, 0, 1, closed="both")],
+            "eta0": [Interval(Real, 0, None, closed="left")],
+        }
+    )
+
+    def __init__(
+        self,
+        *,
+        penalty=None,
+        alpha=0.0001,
+        l1_ratio=0.15,
+        fit_intercept=True,
+        max_iter=1000,
+        tol=1e-3,
+        shuffle=True,
+        verbose=0,
+        eta0=1.0,
+        n_jobs=None,
+        random_state=0,
+        early_stopping=False,
+        validation_fraction=0.1,
+        n_iter_no_change=5,
+        class_weight=None,
+        warm_start=False,
+    ):
+        super().__init__(
+            loss="perceptron",
+            penalty=penalty,
+            alpha=alpha,
+            l1_ratio=l1_ratio,
+            fit_intercept=fit_intercept,
+            max_iter=max_iter,
+            tol=tol,
+            shuffle=shuffle,
+            verbose=verbose,
+            random_state=random_state,
+            learning_rate="constant",
+            eta0=eta0,
+            early_stopping=early_stopping,
+            validation_fraction=validation_fraction,
+            n_iter_no_change=n_iter_no_change,
+            power_t=0.5,
+            warm_start=warm_start,
+            class_weight=class_weight,
+            n_jobs=n_jobs,
+        )
@@ -0,0 +1,308 @@
+# Authors: David Dale <dale.david@mail.ru>
+#          Christian Lorentzen <lorentzen.ch@gmail.com>
+# License: BSD 3 clause
+import warnings
+from numbers import Real
+
+import numpy as np
+from scipy import sparse
+from scipy.optimize import linprog
+
+from ..base import BaseEstimator, RegressorMixin, _fit_context
+from ..exceptions import ConvergenceWarning
+from ..utils import _safe_indexing
+from ..utils._param_validation import Interval, StrOptions
+from ..utils.fixes import parse_version, sp_version
+from ..utils.validation import _check_sample_weight
+from ._base import LinearModel
+
+
+class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):
+    """Linear regression model that predicts conditional quantiles.
+
+    The linear :class:`QuantileRegressor` optimizes the pinball loss for a
+    desired `quantile` and is robust to outliers.
+
+    This model uses an L1 regularization like
+    :class:`~sklearn.linear_model.Lasso`.
+
+    Read more in the :ref:`User Guide <quantile_regression>`.
+
+    .. versionadded:: 1.0
+
+    Parameters
+    ----------
+    quantile : float, default=0.5
+        The quantile that the model tries to predict. It must be strictly
+        between 0 and 1. If 0.5 (default), the model predicts the 50%
+        quantile, i.e. the median.
+
+    alpha : float, default=1.0
+        Regularization constant that multiplies the L1 penalty term.
+
+    fit_intercept : bool, default=True
+        Whether or not to fit the intercept.
+
+    solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \
+            'revised simplex'}, default='highs'
+        Method used by :func:`scipy.optimize.linprog` to solve the linear
+        programming formulation.
+
+        From `scipy>=1.6.0`, it is recommended to use the highs methods because
+        they are the fastest ones. Solvers "highs-ds", "highs-ipm" and "highs"
+        support sparse input data and, in fact, always convert to sparse csc.
+
+        From `scipy>=1.11.0`, "interior-point" is not available anymore.
+
+        .. versionchanged:: 1.4
+           The default of `solver` changed to `"highs"` in version 1.4.
+
+    solver_options : dict, default=None
+        Additional parameters passed to :func:`scipy.optimize.linprog` as
+        options. If `None` and if `solver='interior-point'`, then
+        `{"lstsq": True}` is passed to :func:`scipy.optimize.linprog` for the
+        sake of stability.
+
+    Attributes
+    ----------
+    coef_ : array of shape (n_features,)
+        Estimated coefficients for the features.
+
+    intercept_ : float
+        The intercept of the model, aka bias term.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_iter_ : int
+        The actual number of iterations performed by the solver.
+
+    See Also
+    --------
+    Lasso : The Lasso is a linear model that estimates sparse coefficients
+        with l1 regularization.
+    HuberRegressor : Linear regression model that is robust to outliers.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import QuantileRegressor
+    >>> import numpy as np
+    >>> n_samples, n_features = 10, 2
+    >>> rng = np.random.RandomState(0)
+    >>> y = rng.randn(n_samples)
+    >>> X = rng.randn(n_samples, n_features)
+    >>> # the two following lines are optional in practice
+    >>> from sklearn.utils.fixes import sp_version, parse_version
+    >>> solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"
+    >>> reg = QuantileRegressor(quantile=0.8, solver=solver).fit(X, y)
+    >>> np.mean(y <= reg.predict(X))
+    0.8
+    """
+
+    _parameter_constraints: dict = {
+        "quantile": [Interval(Real, 0, 1, closed="neither")],
+        "alpha": [Interval(Real, 0, None, closed="left")],
+        "fit_intercept": ["boolean"],
+        "solver": [
+            StrOptions(
+                {
+                    "highs-ds",
+                    "highs-ipm",
+                    "highs",
+                    "interior-point",
+                    "revised simplex",
+                }
+            ),
+        ],
+        "solver_options": [dict, None],
+    }
+
+    def __init__(
+        self,
+        *,
+        quantile=0.5,
+        alpha=1.0,
+        fit_intercept=True,
+        solver="highs",
+        solver_options=None,
+    ):
+        self.quantile = quantile
+        self.alpha = alpha
+        self.fit_intercept = fit_intercept
+        self.solver = solver
+        self.solver_options = solver_options
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y, sample_weight=None):
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=["csc", "csr", "coo"],
+            y_numeric=True,
+            multi_output=False,
+        )
+        sample_weight = _check_sample_weight(sample_weight, X)
+
+        n_features = X.shape[1]
+        n_params = n_features
+
+        if self.fit_intercept:
+            n_params += 1
+            # Note that centering y and X with _preprocess_data does not work
+            # for quantile regression.
+
+        # The objective is defined as 1/n * sum(pinball loss) + alpha * L1.
+        # So we rescale the penalty term, which is equivalent.
+        alpha = np.sum(sample_weight) * self.alpha
+
+        if self.solver in (
+            "highs-ds",
+            "highs-ipm",
+            "highs",
+        ) and sp_version < parse_version("1.6.0"):
+            raise ValueError(
+                f"Solver {self.solver} is only available "
+                f"with scipy>=1.6.0, got {sp_version}"
+            )
+        else:
+            solver = self.solver
+
+        if solver == "interior-point" and sp_version >= parse_version("1.11.0"):
+            raise ValueError(
+                f"Solver {solver} is not anymore available in SciPy >= 1.11.0."
+            )
+
+        if sparse.issparse(X) and solver not in ["highs", "highs-ds", "highs-ipm"]:
+            raise ValueError(
+                f"Solver {self.solver} does not support sparse X. "
+                "Use solver 'highs' for example."
+            )
+        # make default solver more stable
+        if self.solver_options is None and solver == "interior-point":
+            solver_options = {"lstsq": True}
+        else:
+            solver_options = self.solver_options
+
+        # After rescaling alpha, the minimization problem is
+        #     min sum(pinball loss) + alpha * L1
+        # Use linear programming formulation of quantile regression
+        #     min_x c x
+        #           A_eq x = b_eq
+        #                0 <= x
+        # x = (s0, s, t0, t, u, v) = slack variables >= 0
+        # intercept = s0 - t0
+        # coef = s - t
+        # c = (0, alpha * 1_p, 0, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n)
+        # residual = y - X@coef - intercept = u - v
+        # A_eq = (1_n, X, -1_n, -X, diag(1_n), -diag(1_n))
+        # b_eq = y
+        # p = n_features
+        # n = n_samples
+        # 1_n = vector of length n with entries equal one
+        # see https://stats.stackexchange.com/questions/384909/
+        #
+        # Filtering out zero sample weights from the beginning makes life
+        # easier for the linprog solver.
+        indices = np.nonzero(sample_weight)[0]
+        n_indices = len(indices)  # use n_mask instead of n_samples
+        if n_indices < len(sample_weight):
+            sample_weight = sample_weight[indices]
+            X = _safe_indexing(X, indices)
+            y = _safe_indexing(y, indices)
+        c = np.concatenate(
+            [
+                np.full(2 * n_params, fill_value=alpha),
+                sample_weight * self.quantile,
+                sample_weight * (1 - self.quantile),
+            ]
+        )
+        if self.fit_intercept:
+            # do not penalize the intercept
+            c[0] = 0
+            c[n_params] = 0
+
+        if solver in ["highs", "highs-ds", "highs-ipm"]:
+            # Note that highs methods always use a sparse CSC memory layout internally,
+            # even for optimization problems parametrized using dense numpy arrays.
+            # Therefore, we work with CSC matrices as early as possible to limit
+            # unnecessary repeated memory copies.
+            eye = sparse.eye(n_indices, dtype=X.dtype, format="csc")
+            if self.fit_intercept:
+                ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype))
+                A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc")
+            else:
+                A_eq = sparse.hstack([X, -X, eye, -eye], format="csc")
+        else:
+            eye = np.eye(n_indices)
+            if self.fit_intercept:
+                ones = np.ones((n_indices, 1))
+                A_eq = np.concatenate([ones, X, -ones, -X, eye, -eye], axis=1)
+            else:
+                A_eq = np.concatenate([X, -X, eye, -eye], axis=1)
+
+        b_eq = y
+
+        result = linprog(
+            c=c,
+            A_eq=A_eq,
+            b_eq=b_eq,
+            method=solver,
+            options=solver_options,
+        )
+        solution = result.x
+        if not result.success:
+            failure = {
+                1: "Iteration limit reached.",
+                2: "Problem appears to be infeasible.",
+                3: "Problem appears to be unbounded.",
+                4: "Numerical difficulties encountered.",
+            }
+            warnings.warn(
+                "Linear programming for QuantileRegressor did not succeed.\n"
+                f"Status is {result.status}: "
+                + failure.setdefault(result.status, "unknown reason")
+                + "\n"
+                + "Result message of linprog:\n"
+                + result.message,
+                ConvergenceWarning,
+            )
+
+        # positive slack - negative slack
+        # solution is an array with (params_pos, params_neg, u, v)
+        params = solution[:n_params] - solution[n_params : 2 * n_params]
+
+        self.n_iter_ = result.nit
+
+        if self.fit_intercept:
+            self.coef_ = params[1:]
+            self.intercept_ = params[0]
+        else:
+            self.coef_ = params
+            self.intercept_ = 0.0
+        return self
@@ -0,0 +1,726 @@
+# Author: Johannes Schönberger
+#
+# License: BSD 3 clause
+
+import warnings
+from numbers import Integral, Real
+
+import numpy as np
+
+from ..base import (
+    BaseEstimator,
+    MetaEstimatorMixin,
+    MultiOutputMixin,
+    RegressorMixin,
+    _fit_context,
+    clone,
+)
+from ..exceptions import ConvergenceWarning
+from ..utils import check_consistent_length, check_random_state
+from ..utils._bunch import Bunch
+from ..utils._param_validation import (
+    HasMethods,
+    Interval,
+    Options,
+    RealNotInt,
+    StrOptions,
+)
+from ..utils.metadata_routing import (
+    MetadataRouter,
+    MethodMapping,
+    _raise_for_params,
+    _routing_enabled,
+    process_routing,
+)
+from ..utils.random import sample_without_replacement
+from ..utils.validation import (
+    _check_method_params,
+    _check_sample_weight,
+    _deprecate_positional_args,
+    check_is_fitted,
+    has_fit_parameter,
+)
+from ._base import LinearRegression
+
+# Spacing between 1.0 and the adjacent representable float (see
+# `numpy.spacing`); used in `_dynamic_max_trials` as a lower clamp so that
+# the arguments of its logarithms never reach zero.
+_EPSILON = np.spacing(1)
+
+
+def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability):
+    """Determine number trials such that at least one outlier-free subset is
+    sampled for the given inlier/outlier ratio.
+
+    Parameters
+    ----------
+    n_inliers : int
+        Number of inliers in the data.
+
+    n_samples : int
+        Total number of samples in the data.
+
+    min_samples : int
+        Minimum number of samples chosen randomly from original data.
+
+    probability : float
+        Probability (confidence) that one outlier-free sample is generated.
+
+    Returns
+    -------
+    trials : int
+        Number of trials.
+
+    """
+    inlier_ratio = n_inliers / float(n_samples)
+    nom = max(_EPSILON, 1 - probability)
+    denom = max(_EPSILON, 1 - inlier_ratio**min_samples)
+    if nom == 1:
+        return 0
+    if denom == 1:
+        return float("inf")
+    return abs(float(np.ceil(np.log(nom) / np.log(denom))))
+
+
+class RANSACRegressor(
+    MetaEstimatorMixin,
+    RegressorMixin,
+    MultiOutputMixin,
+    BaseEstimator,
+):
+    """RANSAC (RANdom SAmple Consensus) algorithm.
+
+    RANSAC is an iterative algorithm for the robust estimation of parameters
+    from a subset of inliers from the complete data set.
+
+    Read more in the :ref:`User Guide <ransac_regression>`.
+
+    Parameters
+    ----------
+    estimator : object, default=None
+        Base estimator object which implements the following methods:
+
+         * `fit(X, y)`: Fit model to given training data and target values.
+         * `score(X, y)`: Returns the mean accuracy on the given test data,
+           which is used for the stop criterion defined by `stop_score`.
+           Additionally, the score is used to decide which of two equally
+           large consensus sets is chosen as the better one.
+         * `predict(X)`: Returns predicted values using the linear model,
+           which is used to compute residual error using loss function.
+
+        If `estimator` is None, then
+        :class:`~sklearn.linear_model.LinearRegression` is used for
+        target values of dtype float.
+
+        Note that the current implementation only supports regression
+        estimators.
+
+    min_samples : int (>= 1) or float ([0, 1]), default=None
+        Minimum number of samples chosen randomly from original data. Treated
+        as an absolute number of samples for `min_samples >= 1`, treated as a
+        relative number `ceil(min_samples * X.shape[0])` for
+        `min_samples < 1`. This is typically chosen as the minimal number of
+        samples necessary to estimate the given `estimator`. By default a
+        :class:`~sklearn.linear_model.LinearRegression` estimator is assumed and
+        `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly
+        dependent upon the model, so if a `estimator` other than
+        :class:`~sklearn.linear_model.LinearRegression` is used, the user must
+        provide a value.
+
+    residual_threshold : float, default=None
+        Maximum residual for a data sample to be classified as an inlier.
+        By default the threshold is chosen as the MAD (median absolute
+        deviation) of the target values `y`. Points whose residuals are
+        strictly equal to the threshold are considered as inliers.
+
+    is_data_valid : callable, default=None
+        This function is called with the randomly selected data before the
+        model is fitted to it: `is_data_valid(X, y)`. If its return value is
+        False the current randomly chosen sub-sample is skipped.
+
+    is_model_valid : callable, default=None
+        This function is called with the estimated model and the randomly
+        selected data: `is_model_valid(model, X, y)`. If its return value is
+        False the current randomly chosen sub-sample is skipped.
+        Rejecting samples with this function is computationally costlier than
+        with `is_data_valid`. `is_model_valid` should therefore only be used if
+        the estimated model is needed for making the rejection decision.
+
+    max_trials : int, default=100
+        Maximum number of iterations for random sample selection.
+
+    max_skips : int, default=np.inf
+        Maximum number of iterations that can be skipped due to finding zero
+        inliers or invalid data defined by ``is_data_valid`` or invalid models
+        defined by ``is_model_valid``.
+
+        .. versionadded:: 0.19
+
+    stop_n_inliers : int, default=np.inf
+        Stop iteration if at least this number of inliers are found.
+
+    stop_score : float, default=np.inf
+        Stop iteration if score is greater equal than this threshold.
+
+    stop_probability : float in range [0, 1], default=0.99
+        RANSAC iteration stops if at least one outlier-free set of the training
+        data is sampled in RANSAC. This requires to generate at least N
+        samples (iterations)::
+
+            N >= log(1 - probability) / log(1 - e**m)
+
+        where the probability (confidence) is typically set to high value such
+        as 0.99 (the default) and e is the current fraction of inliers w.r.t.
+        the total number of samples.
+
+    loss : str, callable, default='absolute_error'
+        String inputs, 'absolute_error' and 'squared_error' are supported which
+        find the absolute error and squared error per sample respectively.
+
+        If ``loss`` is a callable, then it should be a function that takes
+        two arrays as inputs, the true and predicted value and returns a 1-D
+        array with the i-th value of the array corresponding to the loss
+        on ``X[i]``.
+
+        If the loss on a sample is greater than the ``residual_threshold``,
+        then this sample is classified as an outlier.
+
+        .. versionadded:: 0.18
+
+    random_state : int, RandomState instance, default=None
+        The generator used to draw the random sub-samples. It is also passed
+        to `estimator` when that estimator accepts a `random_state` parameter.
+        Pass an int for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    estimator_ : object
+        Best fitted model (copy of the `estimator` object).
+
+    n_trials_ : int
+        Number of random selection trials until one of the stop criteria is
+        met. It is always ``<= max_trials``.
+
+    inlier_mask_ : bool array of shape [n_samples]
+        Boolean mask of inliers classified as ``True``.
+
+    n_skips_no_inliers_ : int
+        Number of iterations skipped due to finding zero inliers.
+
+        .. versionadded:: 0.19
+
+    n_skips_invalid_data_ : int
+        Number of iterations skipped due to invalid data defined by
+        ``is_data_valid``.
+
+        .. versionadded:: 0.19
+
+    n_skips_invalid_model_ : int
+        Number of iterations skipped due to an invalid model defined by
+        ``is_model_valid``.
+
+        .. versionadded:: 0.19
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    HuberRegressor : Linear regression model that is robust to outliers.
+    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
+    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/RANSAC
+    .. [2] https://www.sri.com/wp-content/uploads/2021/12/ransac-publication.pdf
+    .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import RANSACRegressor
+    >>> from sklearn.datasets import make_regression
+    >>> X, y = make_regression(
+    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
+    >>> reg = RANSACRegressor(random_state=0).fit(X, y)
+    >>> reg.score(X, y)
+    0.9885...
+    >>> reg.predict(X[:1,])
+    array([-31.9417...])
+    """  # noqa: E501
+
+    # Declarative constraints, keyed by constructor parameter name.
+    _parameter_constraints: dict = {
+        "estimator": [HasMethods(["fit", "score", "predict"]), None],
+        # Either an integer count (>= 1), a non-integer real in [0, 1], or
+        # None.
+        "min_samples": [
+            Interval(Integral, 1, None, closed="left"),
+            Interval(RealNotInt, 0, 1, closed="both"),
+            None,
+        ],
+        "residual_threshold": [Interval(Real, 0, None, closed="left"), None],
+        "is_data_valid": [callable, None],
+        "is_model_valid": [callable, None],
+        # The three limits below accept a non-negative integer or `np.inf`.
+        "max_trials": [
+            Interval(Integral, 0, None, closed="left"),
+            Options(Real, {np.inf}),
+        ],
+        "max_skips": [
+            Interval(Integral, 0, None, closed="left"),
+            Options(Real, {np.inf}),
+        ],
+        "stop_n_inliers": [
+            Interval(Integral, 0, None, closed="left"),
+            Options(Real, {np.inf}),
+        ],
+        "stop_score": [Interval(Real, None, None, closed="both")],
+        "stop_probability": [Interval(Real, 0, 1, closed="both")],
+        "loss": [StrOptions({"absolute_error", "squared_error"}), callable],
+        "random_state": ["random_state"],
+    }
+
+    def __init__(
+        self,
+        estimator=None,
+        *,
+        min_samples=None,
+        residual_threshold=None,
+        is_data_valid=None,
+        is_model_valid=None,
+        max_trials=100,
+        max_skips=np.inf,
+        stop_n_inliers=np.inf,
+        stop_score=np.inf,
+        stop_probability=0.99,
+        loss="absolute_error",
+        random_state=None,
+    ):
+        # Store the constructor arguments unchanged; nothing is validated or
+        # computed here. Defaults left as `None` are resolved in `fit`.
+        self.estimator = estimator
+        self.min_samples = min_samples
+        self.residual_threshold = residual_threshold
+        self.is_data_valid = is_data_valid
+        self.is_model_valid = is_model_valid
+        self.max_trials = max_trials
+        self.max_skips = max_skips
+        self.stop_n_inliers = stop_n_inliers
+        self.stop_score = stop_score
+        self.stop_probability = stop_probability
+        self.random_state = random_state
+        self.loss = loss
+
+    @_fit_context(
+        # RansacRegressor.estimator is not validated yet
+        prefer_skip_nested_validation=False
+    )
+    # TODO(1.7): remove `sample_weight` from the signature after deprecation
+    # cycle; for backwards compatibility: pop it from `fit_params` before the
+    # `_raise_for_params` check and reinsert it after the check
+    @_deprecate_positional_args(version="1.7")
+    def fit(self, X, y, *, sample_weight=None, **fit_params):
+        """Fit estimator using RANSAC algorithm.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_targets)
+            Target values.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Individual weights for each sample
+            raises error if sample_weight is passed and estimator
+            fit method does not support it.
+
+            .. versionadded:: 0.18
+
+        **fit_params : dict
+            Parameters routed to the `fit` method of the sub-estimator via the
+            metadata routing API.
+
+            .. versionadded:: 1.5
+
+                Only available if
+                `sklearn.set_config(enable_metadata_routing=True)` is set. See
+                :ref:`Metadata Routing User Guide <metadata_routing>` for more
+                details.
+
+        Returns
+        -------
+        self : object
+            Fitted `RANSACRegressor` estimator.
+
+        Raises
+        ------
+        ValueError
+            If no valid consensus set could be found. This occurs if
+            `is_data_valid` and `is_model_valid` return False for all
+            `max_trials` randomly chosen sub-samples.
+        """
+        # Need to validate separately here. We can't pass multi_output=True
+        # because that would allow y to be csr. Delay expensive finiteness
+        # check to the estimator's own input validation.
+        _raise_for_params(fit_params, self, "fit")
+        check_X_params = dict(accept_sparse="csr", force_all_finite=False)
+        check_y_params = dict(ensure_2d=False)
+        X, y = self._validate_data(
+            X, y, validate_separately=(check_X_params, check_y_params)
+        )
+        check_consistent_length(X, y)
+
+        if self.estimator is not None:
+            estimator = clone(self.estimator)
+        else:
+            estimator = LinearRegression()
+
+        if self.min_samples is None:
+            if not isinstance(estimator, LinearRegression):
+                raise ValueError(
+                    "`min_samples` needs to be explicitly set when estimator "
+                    "is not a LinearRegression."
+                )
+            min_samples = X.shape[1] + 1
+        elif 0 < self.min_samples < 1:
+            min_samples = np.ceil(self.min_samples * X.shape[0])
+        elif self.min_samples >= 1:
+            min_samples = self.min_samples
+        if min_samples > X.shape[0]:
+            raise ValueError(
+                "`min_samples` may not be larger than number "
+                "of samples: n_samples = %d." % (X.shape[0])
+            )
+
+        if self.residual_threshold is None:
+            # MAD (median absolute deviation)
+            residual_threshold = np.median(np.abs(y - np.median(y)))
+        else:
+            residual_threshold = self.residual_threshold
+
+        if self.loss == "absolute_error":
+            if y.ndim == 1:
+                loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
+            else:
+                loss_function = lambda y_true, y_pred: np.sum(
+                    np.abs(y_true - y_pred), axis=1
+                )
+        elif self.loss == "squared_error":
+            if y.ndim == 1:
+                loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
+            else:
+                loss_function = lambda y_true, y_pred: np.sum(
+                    (y_true - y_pred) ** 2, axis=1
+                )
+
+        elif callable(self.loss):
+            loss_function = self.loss
+
+        random_state = check_random_state(self.random_state)
+
+        try:  # Not all estimator accept a random_state
+            estimator.set_params(random_state=random_state)
+        except ValueError:
+            pass
+
+        estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight")
+        estimator_name = type(estimator).__name__
+        if sample_weight is not None and not estimator_fit_has_sample_weight:
+            raise ValueError(
+                "%s does not support sample_weight. Sample"
+                " weights are only used for the calibration"
+                " itself." % estimator_name
+            )
+
+        if sample_weight is not None:
+            fit_params["sample_weight"] = sample_weight
+
+        if _routing_enabled():
+            routed_params = process_routing(self, "fit", **fit_params)
+        else:
+            routed_params = Bunch()
+            routed_params.estimator = Bunch(fit={}, predict={}, score={})
+            if sample_weight is not None:
+                sample_weight = _check_sample_weight(sample_weight, X)
+                routed_params.estimator.fit = {"sample_weight": sample_weight}
+
+        n_inliers_best = 1
+        score_best = -np.inf
+        inlier_mask_best = None
+        X_inlier_best = None
+        y_inlier_best = None
+        inlier_best_idxs_subset = None
+        self.n_skips_no_inliers_ = 0
+        self.n_skips_invalid_data_ = 0
+        self.n_skips_invalid_model_ = 0
+
+        # number of data samples
+        n_samples = X.shape[0]
+        sample_idxs = np.arange(n_samples)
+
+        self.n_trials_ = 0
+        max_trials = self.max_trials
+        while self.n_trials_ < max_trials:
+            self.n_trials_ += 1
+
+            if (
+                self.n_skips_no_inliers_
+                + self.n_skips_invalid_data_
+                + self.n_skips_invalid_model_
+            ) > self.max_skips:
+                break
+
+            # choose random sample set
+            subset_idxs = sample_without_replacement(
+                n_samples, min_samples, random_state=random_state
+            )
+            X_subset = X[subset_idxs]
+            y_subset = y[subset_idxs]
+
+            # check if random sample set is valid
+            if self.is_data_valid is not None and not self.is_data_valid(
+                X_subset, y_subset
+            ):
+                self.n_skips_invalid_data_ += 1
+                continue
+
+            # cut `fit_params` down to `subset_idxs`
+            fit_params_subset = _check_method_params(
+                X, params=routed_params.estimator.fit, indices=subset_idxs
+            )
+
+            # fit model for current random sample set
+            estimator.fit(X_subset, y_subset, **fit_params_subset)
+
+            # check if estimated model is valid
+            if self.is_model_valid is not None and not self.is_model_valid(
+                estimator, X_subset, y_subset
+            ):
+                self.n_skips_invalid_model_ += 1
+                continue
+
+            # residuals of all data for current random sample model
+            y_pred = estimator.predict(X)
+            residuals_subset = loss_function(y, y_pred)
+
+            # classify data into inliers and outliers
+            inlier_mask_subset = residuals_subset <= residual_threshold
+            n_inliers_subset = np.sum(inlier_mask_subset)
+
+            # less inliers -> skip current random sample
+            if n_inliers_subset < n_inliers_best:
+                self.n_skips_no_inliers_ += 1
+                continue
+
+            # extract inlier data set
+            inlier_idxs_subset = sample_idxs[inlier_mask_subset]
+            X_inlier_subset = X[inlier_idxs_subset]
+            y_inlier_subset = y[inlier_idxs_subset]
+
+            # cut `fit_params` down to `inlier_idxs_subset`
+            score_params_inlier_subset = _check_method_params(
+                X, params=routed_params.estimator.score, indices=inlier_idxs_subset
+            )
+
+            # score of inlier data set
+            score_subset = estimator.score(
+                X_inlier_subset,
+                y_inlier_subset,
+                **score_params_inlier_subset,
+            )
+
+            # same number of inliers but worse score -> skip current random
+            # sample
+            if n_inliers_subset == n_inliers_best and score_subset < score_best:
+                continue
+
+            # save current random sample as best sample
+            n_inliers_best = n_inliers_subset
+            score_best = score_subset
+            inlier_mask_best = inlier_mask_subset
+            X_inlier_best = X_inlier_subset
+            y_inlier_best = y_inlier_subset
+            inlier_best_idxs_subset = inlier_idxs_subset
+
+            max_trials = min(
+                max_trials,
+                _dynamic_max_trials(
+                    n_inliers_best, n_samples, min_samples, self.stop_probability
+                ),
+            )
+
+            # break if sufficient number of inliers or score is reached
+            if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:
+                break
+
+        # if none of the iterations met the required criteria
+        if inlier_mask_best is None:
+            if (
+                self.n_skips_no_inliers_
+                + self.n_skips_invalid_data_
+                + self.n_skips_invalid_model_
+            ) > self.max_skips:
+                raise ValueError(
+                    "RANSAC skipped more iterations than `max_skips` without"
+                    " finding a valid consensus set. Iterations were skipped"
+                    " because each randomly chosen sub-sample failed the"
+                    " passing criteria. See estimator attributes for"
+                    " diagnostics (n_skips*)."
+                )
+            else:
+                raise ValueError(
+                    "RANSAC could not find a valid consensus set. All"
+                    " `max_trials` iterations were skipped because each"
+                    " randomly chosen sub-sample failed the passing criteria."
+                    " See estimator attributes for diagnostics (n_skips*)."
+                )
+        else:
+            if (
+                self.n_skips_no_inliers_
+                + self.n_skips_invalid_data_
+                + self.n_skips_invalid_model_
+            ) > self.max_skips:
+                warnings.warn(
+                    (
+                        "RANSAC found a valid consensus set but exited"
+                        " early due to skipping more iterations than"
+                        " `max_skips`. See estimator attributes for"
+                        " diagnostics (n_skips*)."
+                    ),
+                    ConvergenceWarning,
+                )
+
+        # estimate final model using all inliers
+        fit_params_best_idxs_subset = _check_method_params(
+            X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset
+        )
+
+        estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset)
+
+        self.estimator_ = estimator
+        self.inlier_mask_ = inlier_mask_best
+        return self
+
+    def predict(self, X, **params):
+        """Predict targets for `X` with the fitted sub-estimator.
+
+        Delegates to `estimator_.predict(X)` after validating `X`.
+
+        Parameters
+        ----------
+        X : {array-like or sparse matrix} of shape (n_samples, n_features)
+            Input data.
+
+        **params : dict
+            Parameters routed to the `predict` method of the sub-estimator via
+            the metadata routing API.
+
+            .. versionadded:: 1.5
+
+                Only available if
+                `sklearn.set_config(enable_metadata_routing=True)` is set. See
+                :ref:`Metadata Routing User Guide <metadata_routing>` for more
+                details.
+
+        Returns
+        -------
+        y : array, shape = [n_samples] or [n_samples, n_targets]
+            Returns predicted values.
+        """
+        check_is_fitted(self)
+        validation_options = {
+            "force_all_finite": False,
+            "accept_sparse": True,
+            "reset": False,
+        }
+        X = self._validate_data(X, **validation_options)
+
+        _raise_for_params(params, self, "predict")
+
+        # Nothing is forwarded to the sub-estimator unless routing is enabled.
+        predict_params = {}
+        if _routing_enabled():
+            routed = process_routing(self, "predict", **params)
+            predict_params = routed.estimator["predict"]
+
+        return self.estimator_.predict(X, **predict_params)
+
+    def score(self, X, y, **params):
+        """Score the fitted sub-estimator on the given data.
+
+        Delegates to `estimator_.score(X, y)` after validating `X`.
+
+        Parameters
+        ----------
+        X : {array-like or sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_targets)
+            Target values.
+
+        **params : dict
+            Parameters routed to the `score` method of the sub-estimator via
+            the metadata routing API.
+
+            .. versionadded:: 1.5
+
+                Only available if
+                `sklearn.set_config(enable_metadata_routing=True)` is set. See
+                :ref:`Metadata Routing User Guide <metadata_routing>` for more
+                details.
+
+        Returns
+        -------
+        z : float
+            Score of the prediction.
+        """
+        check_is_fitted(self)
+        validation_options = {
+            "force_all_finite": False,
+            "accept_sparse": True,
+            "reset": False,
+        }
+        X = self._validate_data(X, **validation_options)
+
+        _raise_for_params(params, self, "score")
+
+        # Nothing is forwarded to the sub-estimator unless routing is enabled.
+        score_params = {}
+        if _routing_enabled():
+            routed = process_routing(self, "score", **params)
+            score_params = routed.estimator["score"]
+
+        return self.estimator_.score(X, y, **score_params)
+
+    def get_metadata_routing(self):
+        """Get metadata routing of this object.
+
+        Please check :ref:`User Guide <metadata_routing>` on how the routing
+        mechanism works.
+
+        .. versionadded:: 1.5
+
+        Returns
+        -------
+        routing : MetadataRouter
+            A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
+            routing information.
+        """
+        # (caller on this meta-estimator, callee on the sub-estimator)
+        routes = (
+            ("fit", "fit"),
+            ("fit", "score"),
+            ("score", "score"),
+            ("predict", "predict"),
+        )
+        mapping = MethodMapping()
+        for caller, callee in routes:
+            mapping = mapping.add(caller=caller, callee=callee)
+
+        return MetadataRouter(owner=self.__class__.__name__).add(
+            estimator=self.estimator, method_mapping=mapping
+        )
+
+    def _more_tags(self):
+        # Registers `check_sample_weights_invariance` under `_xfail_checks`
+        # together with the reason string below.
+        reason = "zero sample_weight is not equivalent to removing samples"
+        xfail_checks = {"check_sample_weights_invariance": reason}
+        return {"_xfail_checks": xfail_checks}
@@ -0,0 +1,371 @@
+"""Solvers for Ridge and LogisticRegression using SAG algorithm"""
+
+# Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
+#
+# License: BSD 3 clause
+
+import warnings
+
+import numpy as np
+
+from ..exceptions import ConvergenceWarning
+from ..utils import check_array
+from ..utils.extmath import row_norms
+from ..utils.validation import _check_sample_weight
+from ._base import make_dataset
+from ._sag_fast import sag32, sag64
+
+
+def get_auto_step_size(
+    max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False
+):
+    """Compute automatic step size for SAG solver.
+
+    A constant ``L`` is first derived from the loss:
+
+    - ``0.25 * (max_squared_sum + fit_intercept) + alpha_scaled`` for the
+      'log' and 'multinomial' losses;
+    - ``max_squared_sum + fit_intercept + alpha_scaled`` for the 'squared'
+      loss.
+
+    The SAG step size is ``1 / L``. The SAGA step size is
+    ``1 / (2 * L + min(2 * n_samples * alpha_scaled, L))``.
+
+    Parameters
+    ----------
+    max_squared_sum : float
+        Maximum squared sum of X over samples.
+
+    alpha_scaled : float
+        Constant that multiplies the regularization term, scaled by
+        1. / n_samples, the number of samples.
+
+    loss : {'log', 'squared', 'multinomial'}
+        The loss function used in SAG solver.
+
+    fit_intercept : bool
+        Specifies if a constant (a.k.a. bias or intercept) will be
+        added to the decision function.
+
+    n_samples : int, default=None
+        Number of rows in X. Required if is_saga=True, ignored otherwise.
+
+    is_saga : bool, default=False
+        Whether to return step size for the SAGA algorithm or the SAG
+        algorithm.
+
+    Returns
+    -------
+    step_size : float
+        Step size used in SAG solver.
+
+    Raises
+    ------
+    ValueError
+        If `loss` is not one of 'log', 'squared' or 'multinomial'.
+
+    TypeError
+        If `is_saga` is True and `n_samples` is None.
+
+    References
+    ----------
+    Schmidt, M., Roux, N. L., & Bach, F. (2013).
+    Minimizing finite sums with the stochastic average gradient
+    https://hal.inria.fr/hal-00860051/document
+
+    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
+    "SAGA: A Fast Incremental Gradient Method With Support
+    for Non-Strongly Convex Composite Objectives" <1407.0202>`
+    """
+    # L is the constant whose inverse gives the SAG step size.
+    if loss in ("log", "multinomial"):
+        L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled
+    elif loss == "squared":
+        L = max_squared_sum + int(fit_intercept) + alpha_scaled
+    else:
+        raise ValueError(
+            "Unknown loss function for SAG solver, got %s instead of 'log', "
+            "'squared' or 'multinomial'" % loss
+        )
+    if is_saga:
+        # The SAGA formula needs the number of samples; fail with an explicit
+        # message instead of an opaque operand error on `2 * None`.
+        if n_samples is None:
+            raise TypeError("n_samples must be provided when is_saga=True")
+        # SAGA theoretical step size is 1/3L or 1 / (2 * (L + mu n))
+        # See Defazio et al. 2014
+        mun = min(2 * n_samples * alpha_scaled, L)
+        step = 1.0 / (2 * L + mun)
+    else:
+        # SAG theoretical step size is 1/16L but it is recommended to use 1 / L
+        # see http://www.birs.ca//workshops//2014/14w5003/files/schmidt.pdf,
+        # slide 65
+        step = 1.0 / L
+    return step
+
+
+def sag_solver(
+    X,
+    y,
+    sample_weight=None,
+    loss="log",
+    alpha=1.0,
+    beta=0.0,
+    max_iter=1000,
+    tol=0.001,
+    verbose=0,
+    random_state=None,
+    check_input=True,
+    max_squared_sum=None,
+    warm_start_mem=None,
+    is_saga=False,
+):
+    """SAG solver for Ridge and LogisticRegression.
+
+    SAG stands for Stochastic Average Gradient: the gradient of the loss is
+    estimated one sample at a time and the model is updated along the way
+    with a constant learning rate.
+
+    IMPORTANT NOTE: the 'sag' solver converges faster when the columns are on
+    the same scale. You can normalize the data by using
+    sklearn.preprocessing.StandardScaler on your data before passing it to the
+    fit method.
+
+    This implementation works with data represented as dense numpy arrays or
+    sparse scipy arrays of floating point values for the features. It will
+    fit the data according to squared loss or log loss.
+
+    The regularizer is a penalty added to the loss function that shrinks model
+    parameters towards the zero vector using the squared euclidean norm L2.
+
+    .. versionadded:: 0.17
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        Training data.
+
+    y : ndarray of shape (n_samples,)
+        Target values. With loss='multinomial', y must be label encoded
+        (see preprocessing.LabelEncoder).
+
+    sample_weight : array-like of shape (n_samples,), default=None
+        Weights applied to individual samples (1. for unweighted).
+
+    loss : {'log', 'squared', 'multinomial'}, default='log'
+        Loss function that will be optimized:
+        -'log' is the binary logistic loss, as used in LogisticRegression.
+        -'squared' is the squared loss, as used in Ridge.
+        -'multinomial' is the multinomial logistic loss, as used in
+         LogisticRegression.
+
+        .. versionadded:: 0.18
+           *loss='multinomial'*
+
+    alpha : float, default=1.
+        L2 regularization term in the objective function
+        ``(0.5 * alpha * || W ||_F^2)``.
+
+    beta : float, default=0.
+        L1 regularization term in the objective function
+        ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.
+
+    max_iter : int, default=1000
+        The maximum number of passes over the training data if the stopping
+        criterion is not reached. ``None`` is treated as 1000.
+
+    tol : float, default=0.001
+        The stopping criterion for the weights. The iterations will stop when
+        max(change in weights) / max(weights) < tol.
+
+    verbose : int, default=0
+        The verbosity level.
+
+    random_state : int, RandomState instance or None, default=None
+        Used when shuffling the data. Pass an int for reproducible output
+        across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    check_input : bool, default=True
+        If False, the input arrays X and y will not be checked.
+
+    max_squared_sum : float, default=None
+        Maximum squared sum of X over samples. If None, it will be computed,
+        going through all the samples. The value should be precomputed
+        to speed up cross validation.
+
+    warm_start_mem : dict, default=None
+        The initialization parameters used for warm starting. Warm starting is
+        currently used in LogisticRegression but not in Ridge.
+        It contains:
+            - 'coef': the weight vector, with the intercept in last line
+                if the intercept is fitted.
+            - 'gradient_memory': the scalar gradient for all seen samples.
+            - 'sum_gradient': the sum of gradient over all seen samples,
+                for each feature.
+            - 'intercept_sum_gradient': the sum of gradient over all seen
+                samples, for the intercept.
+            - 'seen': array of boolean describing the seen samples.
+            - 'num_seen': the number of seen samples.
+
+    is_saga : bool, default=False
+        Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves
+        better in the first epochs, and allows for l1 regularisation.
+
+    Returns
+    -------
+    coef_ : ndarray of shape (n_features,)
+        Weight vector. With loss='multinomial', the transposed weight matrix
+        is returned instead.
+
+    n_iter_ : int
+        The number of full passes on all samples.
+
+    warm_start_mem : dict
+        Contains a 'coef' key with the fitted result, and possibly the
+        fitted intercept at the end of the array. Contains also other keys
+        used for warm starting.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn import linear_model
+    >>> n_samples, n_features = 10, 5
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.randn(n_samples, n_features)
+    >>> y = rng.randn(n_samples)
+    >>> clf = linear_model.Ridge(solver='sag')
+    >>> clf.fit(X, y)
+    Ridge(solver='sag')
+
+    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
+    >>> y = np.array([1, 1, 2, 2])
+    >>> clf = linear_model.LogisticRegression(solver='sag')
+    >>> clf.fit(X, y)
+    LogisticRegression(solver='sag')
+
+    References
+    ----------
+    Schmidt, M., Roux, N. L., & Bach, F. (2013).
+    Minimizing finite sums with the stochastic average gradient
+    https://hal.inria.fr/hal-00860051/document
+
+    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
+    "SAGA: A Fast Incremental Gradient Method With Support
+    for Non-Strongly Convex Composite Objectives" <1407.0202>`
+
+    See Also
+    --------
+    Ridge, SGDRegressor, ElasticNet, Lasso, SVR,
+    LogisticRegression, SGDClassifier, LinearSVC, Perceptron
+    """
+    warm_start_mem = {} if warm_start_mem is None else warm_start_mem
+    # Ridge default max_iter is None
+    max_iter = 1000 if max_iter is None else max_iter
+
+    if check_input:
+        float_dtypes = [np.float64, np.float32]
+        X = check_array(X, dtype=float_dtypes, accept_sparse="csr", order="C")
+        y = check_array(y, dtype=float_dtypes, ensure_2d=False, order="C")
+
+    n_samples, n_features = X.shape[0], X.shape[1]
+    # As in SGD, both regularization strengths are scaled by n_samples.
+    alpha_scaled = float(alpha) / n_samples
+    beta_scaled = float(beta) / n_samples
+
+    # With loss == 'multinomial', y should be label encoded.
+    if loss == "multinomial":
+        n_classes = int(y.max()) + 1
+    else:
+        n_classes = 1
+
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+
+    # Each piece of solver state is taken from `warm_start_mem` when present
+    # and zero-initialized otherwise.
+    coef_init = (
+        warm_start_mem["coef"]
+        if "coef" in warm_start_mem
+        # assume fit_intercept is False
+        else np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
+    )
+
+    # coef_init possibly carries the intercept as an extra last row.
+    # Note that Ridge centers the data before fitting, so fit_intercept=False.
+    fit_intercept = coef_init.shape[0] == (n_features + 1)
+    if fit_intercept:
+        coef_init, intercept_init = coef_init[:-1, :], coef_init[-1, :]
+    else:
+        intercept_init = np.zeros(n_classes, dtype=X.dtype)
+
+    intercept_sum_gradient = (
+        warm_start_mem["intercept_sum_gradient"]
+        if "intercept_sum_gradient" in warm_start_mem
+        else np.zeros(n_classes, dtype=X.dtype)
+    )
+    gradient_memory_init = (
+        warm_start_mem["gradient_memory"]
+        if "gradient_memory" in warm_start_mem
+        else np.zeros((n_samples, n_classes), dtype=X.dtype, order="C")
+    )
+    sum_gradient_init = (
+        warm_start_mem["sum_gradient"]
+        if "sum_gradient" in warm_start_mem
+        else np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
+    )
+    seen_init = (
+        warm_start_mem["seen"]
+        if "seen" in warm_start_mem
+        else np.zeros(n_samples, dtype=np.int32, order="C")
+    )
+    num_seen_init = warm_start_mem["num_seen"] if "num_seen" in warm_start_mem else 0
+
+    dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)
+
+    if max_squared_sum is None:
+        max_squared_sum = row_norms(X, squared=True).max()
+    step_size = get_auto_step_size(
+        max_squared_sum,
+        alpha_scaled,
+        loss,
+        fit_intercept,
+        n_samples=n_samples,
+        is_saga=is_saga,
+    )
+    if step_size * alpha_scaled == 1:
+        raise ZeroDivisionError(
+            "Current sag implementation does not handle "
+            "the case step_size * alpha_scaled == 1"
+        )
+
+    # Pick the compiled routine matching the dtype of X.
+    if X.dtype == np.float64:
+        sag_impl = sag64
+    else:
+        sag_impl = sag32
+    num_seen, n_iter_ = sag_impl(
+        dataset,
+        coef_init,
+        intercept_init,
+        n_samples,
+        n_features,
+        n_classes,
+        tol,
+        max_iter,
+        loss,
+        step_size,
+        alpha_scaled,
+        beta_scaled,
+        sum_gradient_init,
+        gradient_memory_init,
+        seen_init,
+        num_seen_init,
+        fit_intercept,
+        intercept_sum_gradient,
+        intercept_decay,
+        is_saga,
+        verbose,
+    )
+
+    if n_iter_ == max_iter:
+        warnings.warn(
+            "The max_iter was reached which means the coef_ did not converge",
+            ConvergenceWarning,
+        )
+
+    # Put the intercept back as the last row of the coefficient array.
+    if fit_intercept:
+        coef_init = np.vstack((coef_init, intercept_init))
+
+    warm_start_mem = {
+        "coef": coef_init,
+        "sum_gradient": sum_gradient_init,
+        "intercept_sum_gradient": intercept_sum_gradient,
+        "gradient_memory": gradient_memory_init,
+        "seen": seen_init,
+        "num_seen": num_seen,
+    }
+
+    coef_ = coef_init.T if loss == "multinomial" else coef_init[:, 0]
+
+    return coef_, n_iter_, warm_start_mem
@@ -0,0 +1,842 @@
+{{py:
+
+"""
+
+Template file to easily generate fused-types-consistent code using Tempita
+(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
+
+Generated file: sag_fast.pyx
+
+Each class is duplicated for all dtypes (float and double). The keywords
+between double braces are substituted in setup.py.
+
+Authors: Danny Sullivan <dbsullivan23@gmail.com>
+         Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
+         Arthur Mensch <arthur.mensch@m4x.org>
+         Arthur Imbert <arthurimbert05@gmail.com>
+         Joan Massich <mailsik@gmail.com>
+
+License: BSD 3 clause
+"""
+
+# name_suffix, c_type, np_type
+dtypes = [('64', 'double', 'np.float64'),
+          ('32', 'float', 'np.float32')]
+
+}}
+"""SAG and SAGA implementation"""
+
+import numpy as np
+from libc.math cimport exp, fabs, isfinite, log
+from libc.time cimport time, time_t
+
+from ._sgd_fast cimport LossFunction
+from ._sgd_fast cimport Log, SquaredLoss
+
+from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
+
+from libc.stdio cimport printf
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef inline {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y) noexcept nogil:
+    """Return `x` when `x > y` holds, and `y` otherwise."""
+    return x if x > y else y
+
+{{endfor}}
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef {{c_type}} _logsumexp{{name_suffix}}({{c_type}}* arr, int n_classes) noexcept nogil:
+    """Return log(sum(exp(arr))) over the first `n_classes` entries of `arr`.
+
+    The largest entry is subtracted before exponentiating and added back
+    after taking the log, which minimizes the possibility of over/underflow.
+    """
+    cdef int k
+    cdef {{c_type}} largest = arr[0]
+    cdef {{c_type}} exp_sum = 0.0
+
+    # First pass: find the maximum, used as the normalizing shift.
+    for k in range(1, n_classes):
+        if arr[k] > largest:
+            largest = arr[k]
+
+    # Second pass: accumulate the shifted exponentials.
+    for k in range(n_classes):
+        exp_sum += exp(arr[k] - largest)
+
+    return log(exp_sum) + largest
+
+{{endfor}}
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef class MultinomialLogLoss{{name_suffix}}:
+    cdef {{c_type}} _loss(self, {{c_type}} y, {{c_type}}* prediction, int n_classes,
+                      {{c_type}} sample_weight) noexcept nogil:
+        r"""Multinomial logistic regression loss for one sample.
+
+        The value computed is:
+        loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction))
+             = sw (logsumexp(prediction) - prediction[y])
+
+        where:
+            prediction = dot(x_sample, weights) + intercept
+            \delta_{y,c} = 1 if (y == c) else 0
+            sw = sample_weight
+
+        Parameters
+        ----------
+        y : {{c_type}}, between 0 and n_classes - 1
+            Index of the correct class for current sample (i.e. label encoded).
+
+        prediction : pointer to a np.ndarray[{{c_type}}] of shape (n_classes,)
+            Prediction of the multinomial classifier, for current sample.
+
+        n_classes : integer
+            Total number of classes.
+
+        sample_weight : {{c_type}}
+            Weight of current sample.
+
+        Returns
+        -------
+        loss : {{c_type}}
+            Multinomial loss for current sample.
+
+        Reference
+        ---------
+        Bishop, C. M. (2006). Pattern recognition and machine learning.
+        Springer. (Chapter 4.3.4)
+        """
+        cdef {{c_type}} log_norm = _logsumexp{{name_suffix}}(prediction, n_classes)
+
+        # y holds the index of the correct class; it is cast to int to index.
+        return (log_norm - prediction[int(y)]) * sample_weight
+
+    cdef void dloss(self, {{c_type}} y, {{c_type}}* prediction, int n_classes,
+                     {{c_type}} sample_weight, {{c_type}}* gradient_ptr) noexcept nogil:
+        r"""Gradient of the multinomial logistic regression loss for one sample.
+
+        For each class c, the value written to `gradient_ptr[c]` is:
+        grad_c = sw * (p[c] - \delta_{y,c})
+
+        where:
+            p[c] = exp(prediction[c] - logsumexp(prediction))
+            prediction = dot(sample, weights) + intercept
+            \delta_{y,c} = 1 if (y == c) else 0
+            sw = sample_weight
+
+        Note that to obtain the true gradient, this value has to be multiplied
+        by the sample vector x.
+
+        Parameters
+        ----------
+        y : {{c_type}}, between 0 and n_classes - 1
+            Index of the correct class for current sample (i.e. label encoded).
+
+        prediction : pointer to a np.ndarray[{{c_type}}] of shape (n_classes,)
+            Prediction of the multinomial classifier, for current sample.
+
+        n_classes : integer
+            Total number of classes.
+
+        sample_weight : {{c_type}}
+            Weight of current sample.
+
+        gradient_ptr : pointer to a np.ndarray[{{c_type}}] of shape (n_classes,)
+            Gradient vector to be filled.
+
+        Reference
+        ---------
+        Bishop, C. M. (2006). Pattern recognition and machine learning.
+        Springer. (Chapter 4.3.4)
+        """
+        cdef int c
+        cdef {{c_type}} log_norm = _logsumexp{{name_suffix}}(prediction, n_classes)
+
+        for c in range(n_classes):
+            # normalized class probability p[c]
+            gradient_ptr[c] = exp(prediction[c] - log_norm)
+
+            # subtract the one-hot target: y is the index of the correct class
+            if c == y:
+                gradient_ptr[c] -= 1.0
+
+            gradient_ptr[c] *= sample_weight
+
+    def __reduce__(self):
+        # The class is rebuilt from its constructor called with no arguments.
+        return (MultinomialLogLoss{{name_suffix}}, ())
+
+{{endfor}}
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef inline {{c_type}} _soft_thresholding{{name_suffix}}({{c_type}} x, {{c_type}} shrinkage) noexcept nogil:
+    """Return max(x - shrinkage, 0) - max(-x - shrinkage, 0)."""
+    cdef {{c_type}} positive_part = fmax{{name_suffix}}(x - shrinkage, 0)
+    cdef {{c_type}} negative_part = fmax{{name_suffix}}(- x - shrinkage, 0)
+    return positive_part - negative_part
+
+{{endfor}}
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+def sag{{name_suffix}}(
+    SequentialDataset{{name_suffix}} dataset,
+    {{c_type}}[:, ::1] weights_array,
+    {{c_type}}[::1] intercept_array,
+    int n_samples,
+    int n_features,
+    int n_classes,
+    double tol,
+    int max_iter,
+    str loss_function,
+    double step_size,
+    double alpha,
+    double beta,
+    {{c_type}}[:, ::1] sum_gradient_init,
+    {{c_type}}[:, ::1] gradient_memory_init,
+    bint[::1] seen_init,
+    int num_seen,
+    bint fit_intercept,
+    {{c_type}}[::1] intercept_sum_gradient_init,
+    double intercept_decay,
+    bint saga,
+    bint verbose
+):
+    """Stochastic Average Gradient (SAG) and SAGA solvers.
+
+    Used in Ridge and LogisticRegression.
+
+    Some implementation details:
+
+    - Just-in-time (JIT) update: In SAG(A), the average-gradient update is
+    collinear with the drawn sample X_i. Therefore, if the data is sparse, the
+    random sample X_i will change the average gradient only on features j where
+    X_ij != 0. In some cases, the average gradient on feature j might change
+    only after k random samples with no change. In these cases, instead of
+    applying k times the same gradient step on feature j, we apply the gradient
+    step only once, scaled by k. This is called the "just-in-time update", and
+    it is performed in `lagged_update{{name_suffix}}`. This function also
+    applies the proximal operator after the gradient step (if L1 regularization
+    is used in SAGA).
+
+    - Weight scale: In SAG(A), the weights are scaled down at each iteration
+    due to the L2 regularization. To avoid updating all the weights at each
+    iteration, the weight scale is factored out in a separate variable `wscale`
+    which is only used in the JIT update. When this variable is too small, it
+    is reset for numerical stability using the function
+    `scale_weights{{name_suffix}}`. This reset requires applying all remaining
+    JIT updates. This reset is also performed every `n_samples` iterations
+    before each convergence check, so when the algorithm stops, we are sure
+    that there is no remaining JIT updates.
+
+    Reference
+    ---------
+    Schmidt, M., Roux, N. L., & Bach, F. (2013).
+    Minimizing finite sums with the stochastic average gradient
+    https://hal.inria.fr/hal-00860051/document
+    (section 4.3)
+
+    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
+    "SAGA: A Fast Incremental Gradient Method With Support
+    for Non-Strongly Convex Composite Objectives" <1407.0202>`
+    """
+    # the data pointer for x, the current sample
+    cdef {{c_type}} *x_data_ptr = NULL
+    # the index pointer for the column of the data
+    cdef int *x_ind_ptr = NULL
+    # the number of non-zero features for current sample
+    cdef int xnnz = -1
+    # the label value for current sample
+    # the label value for current sample
+    cdef {{c_type}} y
+    # the sample weight
+    cdef {{c_type}} sample_weight
+
+    # helper variable for indexes
+    cdef int f_idx, s_idx, feature_ind, class_ind, j
+    # the number of pass through all samples
+    cdef int n_iter = 0
+    # helper to track iterations through samples
+    cdef int sample_itr
+    # the index (row number) of the current sample
+    cdef int sample_ind
+
+    # the maximum change in weights, used to compute stopping criteria
+    cdef {{c_type}} max_change
+    # a holder variable for the max weight, used to compute stopping criteria
+    cdef {{c_type}} max_weight
+
+    # the start time of the fit
+    cdef time_t start_time
+    # the end time of the fit
+    cdef time_t end_time
+
+    # precomputation since the step size does not change in this implementation
+    cdef {{c_type}} wscale_update = 1.0 - step_size * alpha
+
+    # helper for cumulative sum
+    cdef {{c_type}} cum_sum
+
+    # the pointer to the coef_ or weights
+    cdef {{c_type}}* weights = &weights_array[0, 0]
+
+    # the sum of gradients for each feature
+    cdef {{c_type}}* sum_gradient = &sum_gradient_init[0, 0]
+
+    # the previously seen gradient for each sample
+    cdef {{c_type}}* gradient_memory = &gradient_memory_init[0, 0]
+
+    # the cumulative sums needed for JIT params
+    cdef {{c_type}}[::1] cumulative_sums = np.empty(n_samples, dtype={{np_type}}, order="c")
+
+    # the index for the last time this feature was updated
+    cdef int[::1] feature_hist = np.zeros(n_features, dtype=np.int32, order="c")
+
+    # the previous weights to use to compute stopping criteria
+    cdef {{c_type}}[:, ::1] previous_weights_array = np.zeros((n_features, n_classes), dtype={{np_type}}, order="c")
+    cdef {{c_type}}* previous_weights = &previous_weights_array[0, 0]
+
+    cdef {{c_type}}[::1] prediction = np.zeros(n_classes, dtype={{np_type}}, order="c")
+
+    cdef {{c_type}}[::1] gradient = np.zeros(n_classes, dtype={{np_type}}, order="c")
+
+    # Intermediate variable that need declaration since cython cannot infer when templating
+    cdef {{c_type}} val
+
+    # Bias correction term in saga
+    cdef {{c_type}} gradient_correction
+
+    # the scalar used for multiplying z
+    cdef {{c_type}} wscale = 1.0
+
+    # return value (-1 if an error occurred, 0 otherwise)
+    cdef int status = 0
+
+    # the cumulative sums for each iteration for the sparse implementation
+    cumulative_sums[0] = 0.0
+
+    # the multipliative scale needed for JIT params
+    cdef {{c_type}}[::1] cumulative_sums_prox
+    cdef {{c_type}}* cumulative_sums_prox_ptr
+
+    cdef bint prox = beta > 0 and saga
+
+    # Loss function to optimize
+    cdef LossFunction loss
+    # Whether the loss function is multinomial
+    cdef bint multinomial = False
+    # Multinomial loss function
+    cdef MultinomialLogLoss{{name_suffix}} multiloss
+
+    if loss_function == "multinomial":
+        multinomial = True
+        multiloss = MultinomialLogLoss{{name_suffix}}()
+    elif loss_function == "log":
+        loss = Log()
+    elif loss_function == "squared":
+        loss = SquaredLoss()
+    else:
+        raise ValueError("Invalid loss parameter: got %s instead of "
+                         "one of ('log', 'squared', 'multinomial')"
+                         % loss_function)
+
+    if prox:
+        cumulative_sums_prox = np.empty(n_samples, dtype={{np_type}}, order="c")
+        cumulative_sums_prox_ptr = &cumulative_sums_prox[0]
+    else:
+        cumulative_sums_prox = None
+        cumulative_sums_prox_ptr = NULL
+
+    with nogil:
+        start_time = time(NULL)
+        for n_iter in range(max_iter):
+            for sample_itr in range(n_samples):
+                # extract a random sample
+                sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, &y, &sample_weight)
+
+                # cached index for gradient_memory
+                s_idx = sample_ind * n_classes
+
+                # update the number of samples seen and the seen array
+                if seen_init[sample_ind] == 0:
+                    num_seen += 1
+                    seen_init[sample_ind] = 1
+
+                # make the weight updates (just-in-time gradient step, and prox operator)
+                if sample_itr > 0:
+                   status = lagged_update{{name_suffix}}(
+                       weights=weights,
+                       wscale=wscale,
+                       xnnz=xnnz,
+                       n_samples=n_samples,
+                       n_classes=n_classes,
+                       sample_itr=sample_itr,
+                       cumulative_sums=&cumulative_sums[0],
+                       cumulative_sums_prox=cumulative_sums_prox_ptr,
+                       feature_hist=&feature_hist[0],
+                       prox=prox,
+                       sum_gradient=sum_gradient,
+                       x_ind_ptr=x_ind_ptr,
+                       reset=False,
+                       n_iter=n_iter
+                   )
+                   if status == -1:
+                       break
+
+                # find the current prediction
+                predict_sample{{name_suffix}}(
+                    x_data_ptr=x_data_ptr,
+                    x_ind_ptr=x_ind_ptr,
+                    xnnz=xnnz,
+                    w_data_ptr=weights,
+                    wscale=wscale,
+                    intercept=&intercept_array[0],
+                    prediction=&prediction[0],
+                    n_classes=n_classes
+                )
+
+                # compute the gradient for this sample, given the prediction
+                if multinomial:
+                    multiloss.dloss(y, &prediction[0], n_classes, sample_weight, &gradient[0])
+                else:
+                    gradient[0] = loss.dloss(y, prediction[0]) * sample_weight
+
+                # L2 regularization by simply rescaling the weights
+                wscale *= wscale_update
+
+                # make the updates to the sum of gradients
+                for j in range(xnnz):
+                    feature_ind = x_ind_ptr[j]
+                    val = x_data_ptr[j]
+                    f_idx = feature_ind * n_classes
+                    for class_ind in range(n_classes):
+                        gradient_correction = \
+                            val * (gradient[class_ind] -
+                                   gradient_memory[s_idx + class_ind])
+                        if saga:
+                            # Note that this is not the main gradient step,
+                            # which is performed just-in-time in lagged_update.
+                            # This part is done outside the JIT update
+                            # as it does not depend on the average gradient.
+                            # The prox operator is applied after the JIT update
+                            weights[f_idx + class_ind] -= \
+                                (gradient_correction * step_size
+                                 * (1 - 1. / num_seen) / wscale)
+                        sum_gradient[f_idx + class_ind] += gradient_correction
+
+                # fit the intercept
+                if fit_intercept:
+                    for class_ind in range(n_classes):
+                        gradient_correction = (gradient[class_ind] -
+                                               gradient_memory[s_idx + class_ind])
+                        intercept_sum_gradient_init[class_ind] += gradient_correction
+                        gradient_correction *= step_size * (1. - 1. / num_seen)
+                        if saga:
+                            intercept_array[class_ind] -= \
+                                (step_size * intercept_sum_gradient_init[class_ind] /
+                                 num_seen * intercept_decay) + gradient_correction
+                        else:
+                            intercept_array[class_ind] -= \
+                                (step_size * intercept_sum_gradient_init[class_ind] /
+                                 num_seen * intercept_decay)
+
+                        # check to see that the intercept is not inf or NaN
+                        if not isfinite(intercept_array[class_ind]):
+                            status = -1
+                            break
+                    # Break from the n_samples outer loop if an error happened
+                    # in the fit_intercept n_classes inner loop
+                    if status == -1:
+                        break
+
+                # update the gradient memory for this sample
+                for class_ind in range(n_classes):
+                    gradient_memory[s_idx + class_ind] = gradient[class_ind]
+
+                if sample_itr == 0:
+                    cumulative_sums[0] = step_size / (wscale * num_seen)
+                    if prox:
+                        cumulative_sums_prox[0] = step_size * beta / wscale
+                else:
+                    cumulative_sums[sample_itr] = \
+                        (cumulative_sums[sample_itr - 1] +
+                         step_size / (wscale * num_seen))
+                    if prox:
+                        cumulative_sums_prox[sample_itr] = \
+                        (cumulative_sums_prox[sample_itr - 1] +
+                             step_size * beta / wscale)
+                # If wscale gets too small, we need to reset the scale.
+                # This also resets the just-in-time update system.
+                if wscale < 1e-9:
+                    if verbose:
+                        with gil:
+                            print("rescaling...")
+                    status = scale_weights{{name_suffix}}(
+                        weights=weights,
+                        wscale=&wscale,
+                        n_features=n_features,
+                        n_samples=n_samples,
+                        n_classes=n_classes,
+                        sample_itr=sample_itr,
+                        cumulative_sums=&cumulative_sums[0],
+                        cumulative_sums_prox=cumulative_sums_prox_ptr,
+                        feature_hist=&feature_hist[0],
+                        prox=prox,
+                        sum_gradient=sum_gradient,
+                        n_iter=n_iter
+                    )
+                    if status == -1:
+                        break
+
+            # Break from the n_iter outer loop if an error happened in the
+            # n_samples inner loop
+            if status == -1:
+                break
+
+            # We scale the weights every n_samples iterations and reset the
+            # just-in-time update system for numerical stability.
+            # Because this reset is done before every convergence check, we are
+            # sure there is no remaining lagged update when the algorithm stops.
+            status = scale_weights{{name_suffix}}(
+                weights=weights,
+                wscale=&wscale,
+                n_features=n_features,
+                n_samples=n_samples,
+                n_classes=n_classes,
+                sample_itr=n_samples - 1,
+                cumulative_sums=&cumulative_sums[0],
+                cumulative_sums_prox=cumulative_sums_prox_ptr,
+                feature_hist=&feature_hist[0],
+                prox=prox,
+                sum_gradient=sum_gradient,
+                n_iter=n_iter
+            )
+            if status == -1:
+                break
+
+            # check if the stopping criteria is reached
+            max_change = 0.0
+            max_weight = 0.0
+            for idx in range(n_features * n_classes):
+                max_weight = fmax{{name_suffix}}(max_weight, fabs(weights[idx]))
+                max_change = fmax{{name_suffix}}(max_change, fabs(weights[idx] - previous_weights[idx]))
+                previous_weights[idx] = weights[idx]
+            if ((max_weight != 0 and max_change / max_weight <= tol)
+                or max_weight == 0 and max_change == 0):
+                if verbose:
+                    end_time = time(NULL)
+                    with gil:
+                        print("convergence after %d epochs took %d seconds" %
+                              (n_iter + 1, end_time - start_time))
+                break
+            elif verbose:
+                printf('Epoch %d, change: %.8f\n', n_iter + 1,
+                                                  max_change / max_weight)
+    n_iter += 1
+    # We do the error treatment here based on error code in status to avoid
+    # re-acquiring the GIL within the cython code, which slows the computation
+    # when the sag/saga solver is used concurrently in multiple Python threads.
+    if status == -1:
+        raise ValueError(("Floating-point under-/overflow occurred at epoch"
+                          " #%d. Scaling input data with StandardScaler or"
+                          " MinMaxScaler might help.") % n_iter)
+
+    if verbose and n_iter >= max_iter:
+        end_time = time(NULL)
+        print(("max_iter reached after %d seconds") %
+              (end_time - start_time))
+
+    return num_seen, n_iter
+
+{{endfor}}
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef int scale_weights{{name_suffix}}(
+    {{c_type}}* weights,
+    {{c_type}}* wscale,
+    int n_features,
+    int n_samples,
+    int n_classes,
+    int sample_itr,
+    {{c_type}}* cumulative_sums,
+    {{c_type}}* cumulative_sums_prox,
+    int* feature_hist,
+    bint prox,
+    {{c_type}}* sum_gradient,
+    int n_iter
+) noexcept nogil:
+    """Fold `wscale` into the weights, set it back to 1.0 and restart the
+    just-in-time (JIT) update system.
+
+    See `sag{{name_suffix}}`'s docstring about the JIT update system.
+
+    wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr)
+    can become very small, so it is reset to 1.0 every n_samples iterations
+    for numerical stability. Rescaling requires every coefficient to be up to
+    date first, hence the full lagged update performed here. This also limits
+    the size of `cumulative_sums`.
+
+    Returns the status of the lagged update: 0 on success, -1 if a rescaled
+    weight is not finite. `wscale` is only reset on success.
+    """
+    # With reset=True the lagged update walks over every feature, so the
+    # feature count is passed in place of a sample's number of non-zeros and
+    # no index array is supplied.
+    cdef int status = lagged_update{{name_suffix}}(
+        weights=weights,
+        wscale=wscale[0],
+        xnnz=n_features,
+        n_samples=n_samples,
+        n_classes=n_classes,
+        sample_itr=sample_itr + 1,
+        cumulative_sums=cumulative_sums,
+        cumulative_sums_prox=cumulative_sums_prox,
+        feature_hist=feature_hist,
+        prox=prox,
+        sum_gradient=sum_gradient,
+        x_ind_ptr=NULL,
+        reset=True,
+        n_iter=n_iter
+    )
+    if status != 0:
+        # leave wscale untouched so the caller can report the failure
+        return status
+    # the weights now carry the scale themselves
+    wscale[0] = 1.0
+    return status
+
+{{endfor}}
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef int lagged_update{{name_suffix}}(
+    {{c_type}}* weights,
+    {{c_type}} wscale,
+    int xnnz,
+    int n_samples,
+    int n_classes,
+    int sample_itr,
+    {{c_type}}* cumulative_sums,
+    {{c_type}}* cumulative_sums_prox,
+    int* feature_hist,
+    bint prox,
+    {{c_type}}* sum_gradient,
+    int* x_ind_ptr,
+    bint reset,
+    int n_iter
+) noexcept nogil:
+    """Apply the pending just-in-time (JIT) updates to a set of features.
+
+    See `sag{{name_suffix}}`'s docstring about the JIT update system.
+
+    The updates that await are kept in memory using cumulative_sums,
+    cumulative_sums_prox, wscale and feature_hist. See original SAGA paper
+    (Defazio et al. 2014) for details.
+
+    If reset=False, the features updated are the `xnnz` entries of
+    `x_ind_ptr` (the non-zero features of the present sample).
+
+    If reset=True, `x_ind_ptr` is not read: features 0..xnnz-1 are all
+    updated, every updated weight is additionally multiplied by `wscale`, and
+    the last used entry of the cumulative sums is set to 0. `wscale` is
+    received by value, so setting it back to 1 is left to the caller (see
+    `scale_weights{{name_suffix}}`).
+
+    `n_iter` is accepted but not used in this function.
+
+    Returns 0 on success, and -1 if, with reset=True, a rescaled weight is
+    not finite.
+    """
+    cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind
+    cdef {{c_type}} cum_sum, grad_step, prox_step, cum_sum_prox
+    for feature_ind in range(xnnz):
+        # For a per-sample update the loop counter is a position in
+        # x_ind_ptr; for a reset it already is the feature index.
+        if not reset:
+            feature_ind = x_ind_ptr[feature_ind]
+        # offset of this feature's block of n_classes weights
+        f_idx = feature_ind * n_classes
+
+        # Step accumulated up to the previous iteration...
+        cum_sum = cumulative_sums[sample_itr - 1]
+        if prox:
+            cum_sum_prox = cumulative_sums_prox[sample_itr - 1]
+        # ...minus the part that was already applied when this feature was
+        # last updated (feature_hist holds the sample_itr of that update, 0
+        # meaning nothing has been applied yet).
+        if feature_hist[feature_ind] != 0:
+            cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1]
+            if prox:
+                cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1]
+        if not prox:
+            # No proximal step: all missed gradient steps collapse into one.
+            for class_ind in range(n_classes):
+                idx = f_idx + class_ind
+                weights[idx] -= cum_sum * sum_gradient[idx]
+                if reset:
+                    weights[idx] *= wscale
+                    if not isfinite(weights[idx]):
+                        # returning here does not require the gil as the return
+                        # type is a C integer
+                        return -1
+        else:
+            for class_ind in range(n_classes):
+                idx = f_idx + class_ind
+                if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox:
+                    # In this case, we can perform all the gradient steps and
+                    # all the proximal steps in this order, which is more
+                    # efficient than unrolling all the lagged updates.
+                    # Idea taken from scikit-learn-contrib/lightning.
+                    weights[idx] -= cum_sum * sum_gradient[idx]
+                    weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx],
+                                                      cum_sum_prox)
+                else:
+                    # Otherwise replay the missed iterations one at a time,
+                    # each being a gradient step followed by a proximal step.
+                    last_update_ind = feature_hist[feature_ind]
+                    # NOTE(review): no assignment of -1 to feature_hist is
+                    # visible in this function; confirm whether this branch
+                    # is still reachable.
+                    if last_update_ind == -1:
+                        last_update_ind = sample_itr - 1
+                    for lagged_ind in range(sample_itr - 1,
+                                   last_update_ind - 1, -1):
+                        # The cumulative sums are prefix sums, so a single
+                        # iteration's step is the difference of two
+                        # consecutive entries (entry 0 is its own step).
+                        if lagged_ind > 0:
+                            grad_step = (cumulative_sums[lagged_ind]
+                               - cumulative_sums[lagged_ind - 1])
+                            prox_step = (cumulative_sums_prox[lagged_ind]
+                               - cumulative_sums_prox[lagged_ind - 1])
+                        else:
+                            grad_step = cumulative_sums[lagged_ind]
+                            prox_step = cumulative_sums_prox[lagged_ind]
+                        weights[idx] -= sum_gradient[idx] * grad_step
+                        weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx],
+                                                          prox_step)
+
+                if reset:
+                    weights[idx] *= wscale
+                    # check to see that the weight is not inf or NaN
+                    if not isfinite(weights[idx]):
+                        return -1
+        # Remember up to which iteration this feature is now up to date.
+        # With reset=True the modulo maps sample_itr == n_samples (the value
+        # received for an end-of-epoch reset) to 0, the "nothing applied yet"
+        # marker tested above.
+        if reset:
+            feature_hist[feature_ind] = sample_itr % n_samples
+        else:
+            feature_hist[feature_ind] = sample_itr
+
+    # After a reset every visited feature is up to date, so the accumulated
+    # step at the current position is cleared.
+    if reset:
+        cumulative_sums[sample_itr - 1] = 0.0
+        if prox:
+            cumulative_sums_prox[sample_itr - 1] = 0.0
+
+    return 0
+
+{{endfor}}
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef void predict_sample{{name_suffix}}(
+    {{c_type}}* x_data_ptr,
+    int* x_ind_ptr,
+    int xnnz,
+    {{c_type}}* w_data_ptr,
+    {{c_type}} wscale,
+    {{c_type}}* intercept,
+    {{c_type}}* prediction,
+    int n_classes
+) noexcept nogil:
+    """Evaluate the linear model on one sparse sample, for every class.
+
+    For each class `c` this stores
+    `wscale * sum_j(w[ind[j] * n_classes + c] * x[j]) + intercept[c]`
+    into `prediction[c]`.
+
+    Parameters
+    ----------
+    x_data_ptr : pointer
+        Values of the non-zero entries of the sample x
+
+    x_ind_ptr : pointer
+        Feature indices of the non-zero entries of the sample x
+
+    xnnz : int
+        Number of non-zero entries in the sample x
+
+    w_data_ptr : pointer
+        Dense weights w, indexed as `feature * n_classes + class`
+
+    wscale : {{c_type}}
+        Scale factor applied to the weights w
+
+    intercept : pointer
+        Intercepts, one per class
+
+    prediction : pointer
+        Output buffer receiving one prediction per class
+
+    n_classes : int
+        Number of classes in multinomial case. Equals 1 in binary case.
+
+    """
+    cdef int cls, nz, col
+    cdef {{c_type}} acc
+
+    for cls in range(n_classes):
+        # sparse dot product: only the non-zero entries of x contribute
+        acc = 0.0
+        for nz in range(xnnz):
+            col = x_ind_ptr[nz]
+            acc += w_data_ptr[col * n_classes + cls] * x_data_ptr[nz]
+
+        prediction[cls] = wscale * acc + intercept[cls]
+
+
+{{endfor}}
+
+
+def _multinomial_grad_loss_all_samples(
+    SequentialDataset64 dataset,
+    double[:, ::1] weights_array,
+    double[::1] intercept_array,
+    int n_samples,
+    int n_features,
+    int n_classes
+):
+    """Sum the multinomial loss and its gradient over `n_samples` samples.
+
+    Used for testing purpose only.
+
+    Samples are read sequentially with `dataset.next`. The return value is
+    the pair (summed loss, summed gradient), the gradient being an
+    (n_features, n_classes) array of doubles.
+    """
+    # output slots filled by `dataset.next`
+    cdef double *sample_data = NULL
+    cdef int *sample_indices = NULL
+    cdef int n_nonzero = -1
+    cdef double target
+    cdef double weight
+
+    # the weights are used as given, without any rescaling
+    cdef double unit_scale = 1.0
+    cdef int sample_no, nz, cls, row_start
+    cdef double feature_value
+    cdef double total_loss = 0.0
+
+    cdef MultinomialLogLoss64 loss_fn = MultinomialLogLoss64()
+
+    cdef double[:, ::1] grad_sum_view = np.zeros(
+        (n_features, n_classes), dtype=np.double, order="c"
+    )
+    cdef double* grad_sum = &grad_sum_view[0, 0]
+
+    cdef double[::1] scores = np.zeros(n_classes, dtype=np.double, order="c")
+
+    cdef double[::1] sample_grad = np.zeros(n_classes, dtype=np.double, order="c")
+
+    with nogil:
+        for sample_no in range(n_samples):
+            # fetch the next sample of the dataset
+            dataset.next(&sample_data, &sample_indices, &n_nonzero, &target, &weight)
+
+            # per-class predictions for this sample
+            predict_sample64(
+                x_data_ptr=sample_data,
+                x_ind_ptr=sample_indices,
+                xnnz=n_nonzero,
+                w_data_ptr=&weights_array[0, 0],
+                wscale=unit_scale,
+                intercept=&intercept_array[0],
+                prediction=&scores[0],
+                n_classes=n_classes
+            )
+
+            # gradient with respect to the predictions, then the loss itself
+            loss_fn.dloss(target, &scores[0], n_classes, weight, &sample_grad[0])
+            total_loss += loss_fn._loss(target, &scores[0], n_classes, weight)
+
+            # scatter the sample's contribution into the gradient sum
+            for nz in range(n_nonzero):
+                row_start = sample_indices[nz] * n_classes
+                feature_value = sample_data[nz]
+                for cls in range(n_classes):
+                    grad_sum[row_start + cls] += sample_grad[cls] * feature_value
+
+    return total_loss, grad_sum_view
@@ -0,0 +1,26 @@
+# License: BSD 3 clause
+"""Helper to load LossFunction from sgd_fast.pyx to sag_fast.pyx"""
+
+# C-level declaration of the `LossFunction` base class implemented in
+# _sgd_fast.pyx, so that other Cython modules can cimport it.
+# Both methods take the target `y` first and the prediction `p` second.
+cdef class LossFunction:
+    cdef double loss(self, double y, double p) noexcept nogil
+    cdef double dloss(self, double y, double p) noexcept nogil
+
+
+# Declaration of the base class for regression losses (see _sgd_fast.pyx).
+cdef class Regression(LossFunction):
+    cdef double loss(self, double y, double p) noexcept nogil
+    cdef double dloss(self, double y, double p) noexcept nogil
+
+
+# Declaration of the base class for classification losses (see
+# _sgd_fast.pyx).
+cdef class Classification(LossFunction):
+    cdef double loss(self, double y, double p) noexcept nogil
+    cdef double dloss(self, double y, double p) noexcept nogil
+
+
+# Declaration of the logistic regression loss (see _sgd_fast.pyx).
+cdef class Log(Classification):
+    cdef double loss(self, double y, double p) noexcept nogil
+    cdef double dloss(self, double y, double p) noexcept nogil
+
+
+# Declaration of the squared loss (see _sgd_fast.pyx).
+cdef class SquaredLoss(Regression):
+    cdef double loss(self, double y, double p) noexcept nogil
+    cdef double dloss(self, double y, double p) noexcept nogil
@@ -0,0 +1,780 @@
+{{py:
+
+"""
+Template file to easily generate fused types consistent code using Tempita
+(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
+
+Generated file: _sgd_fast.pyx
+
+Each relevant function is duplicated for the dtypes float and double.
+The keywords between double braces are substituted in setup.py.
+
+Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+         Mathieu Blondel (partial_fit support)
+         Rob Zinkov (passive-aggressive)
+         Lars Buitinck
+
+License: BSD 3 clause
+"""
+
+# The dtypes are defined as follows (name_suffix, c_type, np_type).
+# Every `{{for name_suffix, c_type, np_type in dtypes}}` section of this
+# template is expanded once per entry: `name_suffix` is appended to the
+# generated names, `c_type` is the C scalar type and `np_type` the matching
+# NumPy dtype.
+dtypes = [
+    ("64", "double", "np.float64"),
+    ("32", "float", "np.float32"),
+]
+
+}}
+"""SGD implementation"""
+
+import numpy as np
+from time import time
+
+from cython cimport floating
+from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
+
+from ..utils._typedefs cimport uint32_t
+from ..utils._weight_vector cimport WeightVector32, WeightVector64
+from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
+
+
+# Integer codes used by the SGD routines. The string literal below is
+# verbatim C code that Cython copies into the generated C file, where it
+# defines the codes as preprocessor macros; the declarations after it make
+# those macros usable from Cython under the same names.
+cdef extern from *:
+    """
+    /* Penalty constants */
+    #define NO_PENALTY 0
+    #define L1 1
+    #define L2 2
+    #define ELASTICNET 3
+
+    /* Learning rate constants */
+    #define CONSTANT 1
+    #define OPTIMAL 2
+    #define INVSCALING 3
+    #define ADAPTIVE 4
+    #define PA1 5
+    #define PA2 6
+    """
+    # codes for the `penalty_type` argument
+    int NO_PENALTY = 0
+    int L1 = 1
+    int L2 = 2
+    int ELASTICNET = 3
+
+    # codes for the `learning_rate` argument
+    # (PA1 / PA2: Passive Aggressive-I / -II)
+    int CONSTANT = 1
+    int OPTIMAL = 2
+    int INVSCALING = 3
+    int ADAPTIVE = 4
+    int PA1 = 5
+    int PA2 = 6
+
+
+# ----------------------------------------
+# Extension Types for Loss Functions
+# ----------------------------------------
+
+cdef class LossFunction:
+    """Base class for convex loss functions
+
+    The `cdef` methods defined here are placeholders that return 0; concrete
+    losses override them. `py_loss` and `py_dloss` make them callable from
+    Python.
+    """
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        """Compute the value of the loss.
+
+        Parameters
+        ----------
+        y : double
+            The true value (aka target).
+        p : double
+            The prediction, `p = w^T x + intercept`.
+
+        Returns
+        -------
+        double
+            The loss evaluated at `p` and `y`.
+        """
+        return 0.0
+
+    def py_dloss(self, double p, double y):
+        """Python-callable wrapper around `dloss`, for testing.
+
+        Pytest needs a python function and can't use cdef functions.
+        Note that the prediction comes first here, whereas `dloss` takes the
+        target first.
+
+        Parameters
+        ----------
+        p : double
+            The prediction, `p = w^T x + intercept`.
+        y : double
+            The true value (aka target).
+
+        Returns
+        -------
+        double
+            The derivative of the loss function with regards to `p`.
+        """
+        return self.dloss(y, p)
+
+    def py_loss(self, double p, double y):
+        """Python-callable wrapper around `loss`, for testing.
+
+        Pytest needs a python function and can't use cdef functions.
+        Note that the prediction comes first here, whereas `loss` takes the
+        target first.
+
+        Parameters
+        ----------
+        p : double
+            The prediction, `p = w^T x + intercept`.
+        y : double
+            The true value (aka target).
+
+        Returns
+        -------
+        double
+            The loss evaluated at `p` and `y`.
+        """
+        return self.loss(y, p)
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        """Compute the derivative of the loss with respect to the
+        prediction `p`.
+
+        Parameters
+        ----------
+        y : double
+            The true value (aka target).
+        p : double
+            The prediction, `p = w^T x + intercept`.
+
+        Returns
+        -------
+        double
+            The derivative of the loss function with regards to `p`.
+        """
+        return 0.0
+
+
+cdef class Regression(LossFunction):
+    """Base class for loss functions for regression"""
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        # placeholder, overridden by the concrete regression losses
+        return 0.0
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        # placeholder, overridden by the concrete regression losses
+        return 0.0
+
+
+cdef class Classification(LossFunction):
+    """Base class for loss functions for classification"""
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        # placeholder, overridden by the concrete classification losses
+        return 0.0
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        # placeholder, overridden by the concrete classification losses
+        return 0.0
+
+
+cdef class ModifiedHuber(Classification):
+    """Modified Huber loss for binary classification with y in {-1, 1}
+
+    With the margin `z = p * y`, the loss is 0 for `z >= 1`,
+    `(1 - z)^2` for `-1 <= z < 1` and `-4 * z` below that.
+
+    This is equivalent to quadratically smoothed SVM with gamma = 2.
+
+    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
+    Stochastic Gradient Descent', ICML'04.
+    """
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double margin = p * y
+        if margin >= 1.0:
+            return 0.0
+        if margin >= -1.0:
+            # quadratic zone
+            return (1.0 - margin) * (1.0 - margin)
+        # linear zone
+        return -4.0 * margin
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double margin = p * y
+        if margin >= 1.0:
+            return 0.0
+        if margin >= -1.0:
+            return 2.0 * (1.0 - margin) * -y
+        return -4.0 * y
+
+    def __reduce__(self):
+        # stateless: rebuilt by calling the class without arguments
+        return ModifiedHuber, ()
+
+
+cdef class Hinge(Classification):
+    """Hinge loss for binary classification tasks with y in {-1,1}
+
+    The loss is `threshold - p * y` when `p * y <= threshold`, 0 otherwise.
+
+    Parameters
+    ----------
+
+    threshold : float, default=1.0
+        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
+        When threshold=0.0, one gets the loss used by the Perceptron.
+    """
+
+    cdef double threshold
+
+    def __init__(self, double threshold=1.0):
+        self.threshold = threshold
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double margin = p * y
+        return self.threshold - margin if margin <= self.threshold else 0.0
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double margin = p * y
+        return -y if margin <= self.threshold else 0.0
+
+    def __reduce__(self):
+        # the threshold is the only state needed to rebuild the object
+        return Hinge, (self.threshold,)
+
+
+cdef class SquaredHinge(Classification):
+    """Squared Hinge loss for binary classification tasks with y in {-1,1}
+
+    The loss is `(threshold - p * y)^2` when `threshold - p * y > 0`,
+    0 otherwise.
+
+    Parameters
+    ----------
+
+    threshold : float, default=1.0
+        Margin threshold. When threshold=1.0, one gets the loss used by
+        (quadratically penalized) SVM.
+    """
+
+    cdef double threshold
+
+    def __init__(self, double threshold=1.0):
+        self.threshold = threshold
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double shortfall = self.threshold - p * y
+        return shortfall * shortfall if shortfall > 0 else 0.0
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double shortfall = self.threshold - p * y
+        return -2 * y * shortfall if shortfall > 0 else 0.0
+
+    def __reduce__(self):
+        # the threshold is the only state needed to rebuild the object
+        return SquaredHinge, (self.threshold,)
+
+
+cdef class Log(Classification):
+    """Logistic regression loss for binary classification with y in {-1, 1}
+
+    With the margin `z = p * y`, the loss is `log(1 + exp(-z))`. For
+    `|z| > 18` both the loss and its derivative are replaced by their
+    asymptotic approximations.
+    """
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double margin = p * y
+        if margin > 18:
+            # log(1 + t) is approximately t for tiny t = exp(-margin)
+            return exp(-margin)
+        elif margin < -18:
+            # exp(-margin) dominates the 1, so the log is about -margin
+            return -margin
+        else:
+            return log(1.0 + exp(-margin))
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double margin = p * y
+        if margin > 18.0:
+            # 1 / (exp(margin) + 1) is approximately exp(-margin)
+            return exp(-margin) * -y
+        elif margin < -18.0:
+            # exp(margin) is negligible next to 1
+            return -y
+        else:
+            return -y / (exp(margin) + 1.0)
+
+    def __reduce__(self):
+        # stateless: rebuilt by calling the class without arguments
+        return Log, ()
+
+
+cdef class SquaredLoss(Regression):
+    """Squared loss traditionally used in linear regression.
+
+    loss = 0.5 * (p - y)^2
+    """
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double residual = p - y
+        return 0.5 * residual * residual
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double residual = p - y
+        return residual
+
+    def __reduce__(self):
+        # stateless: rebuilt by calling the class without arguments
+        return SquaredLoss, ()
+
+
+cdef class Huber(Regression):
+    """Huber regression loss
+
+    Variant of the SquaredLoss that is robust to outliers: quadratic while
+    `|p - y| <= c`, linear for larger errors.
+
+    https://en.wikipedia.org/wiki/Huber_Loss_Function
+    """
+
+    # error magnitude at which the loss switches from quadratic to linear
+    cdef double c
+
+    def __init__(self, double c):
+        self.c = c
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double residual = p - y
+        cdef double magnitude = fabs(residual)
+        if magnitude <= self.c:
+            return 0.5 * residual * residual
+        return self.c * magnitude - (0.5 * self.c * self.c)
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double residual = p - y
+        if fabs(residual) <= self.c:
+            return residual
+        # linear zone: the slope is +/- c, following the sign of the error
+        if residual > 0.0:
+            return self.c
+        return -self.c
+
+    def __reduce__(self):
+        # c is the only state needed to rebuild the object
+        return Huber, (self.c,)
+
+
+cdef class EpsilonInsensitive(Regression):
+    """Epsilon-Insensitive loss (used by SVR).
+
+    loss = max(0, |y - p| - epsilon)
+    """
+
+    # half-width of the zone around the target where errors cost nothing
+    cdef double epsilon
+
+    def __init__(self, double epsilon):
+        self.epsilon = epsilon
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double excess = fabs(y - p) - self.epsilon
+        if excess > 0:
+            return excess
+        return 0.0
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        # prediction too low by more than epsilon
+        if y - p > self.epsilon:
+            return -1.0
+        # prediction too high by more than epsilon
+        if p - y > self.epsilon:
+            return 1.0
+        return 0.0
+
+    def __reduce__(self):
+        # epsilon is the only state needed to rebuild the object
+        return EpsilonInsensitive, (self.epsilon,)
+
+
+cdef class SquaredEpsilonInsensitive(Regression):
+    """Squared Epsilon-Insensitive loss.
+
+    loss = max(0, |y - p| - epsilon)^2
+    """
+
+    # half-width of the zone around the target where errors cost nothing
+    cdef double epsilon
+
+    def __init__(self, double epsilon):
+        self.epsilon = epsilon
+
+    cdef double loss(self, double y, double p) noexcept nogil:
+        cdef double excess = fabs(y - p) - self.epsilon
+        if excess > 0:
+            return excess * excess
+        return 0.0
+
+    cdef double dloss(self, double y, double p) noexcept nogil:
+        cdef double gap = y - p
+        # prediction too low by more than epsilon
+        if gap > self.epsilon:
+            return -2 * (gap - self.epsilon)
+        # prediction too high by more than epsilon
+        if gap < -self.epsilon:
+            return 2 * (-gap - self.epsilon)
+        return 0.0
+
+    def __reduce__(self):
+        # epsilon is the only state needed to rebuild the object
+        return SquaredEpsilonInsensitive, (self.epsilon,)
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+def _plain_sgd{{name_suffix}}(
+    const {{c_type}}[::1] weights,
+    double intercept,
+    const {{c_type}}[::1] average_weights,
+    double average_intercept,
+    LossFunction loss,
+    int penalty_type,
+    double alpha,
+    double C,
+    double l1_ratio,
+    SequentialDataset{{name_suffix}} dataset,
+    const unsigned char[::1] validation_mask,
+    bint early_stopping,
+    validation_score_cb,
+    int n_iter_no_change,
+    unsigned int max_iter,
+    double tol,
+    int fit_intercept,
+    int verbose,
+    bint shuffle,
+    uint32_t seed,
+    double weight_pos,
+    double weight_neg,
+    int learning_rate,
+    double eta0,
+    double power_t,
+    bint one_class,
+    double t=1.0,
+    double intercept_decay=1.0,
+    int average=0,
+):
+    """SGD for generic loss functions and penalties with optional averaging
+
+    Parameters
+    ----------
+    weights : ndarray[{{c_type}}, ndim=1]
+        The allocated vector of weights.
+    intercept : double
+        The initial intercept.
+    average_weights : ndarray[{{c_type}}, ndim=1]
+        The average weights as computed for ASGD. Should be None if average
+        is 0.
+    average_intercept : double
+        The average intercept for ASGD. Should be 0 if average is 0.
+    loss : LossFunction
+        A concrete ``LossFunction`` object.
+    penalty_type : int
+        The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net.
+    alpha : float
+        The regularization parameter.
+    C : float
+        Maximum step size for passive aggressive.
+    l1_ratio : float
+        The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
+        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
+    dataset : SequentialDataset
+        A concrete ``SequentialDataset`` object.
+    validation_mask : ndarray[unsigned char, ndim=1]
+        Equal to True on the validation set.
+    early_stopping : boolean
+        Whether to use a stopping criterion based on the validation set.
+    validation_score_cb : callable
+        A callable to compute a validation score given the current
+        coefficients and intercept values.
+        Used only if early_stopping is True.
+    n_iter_no_change : int
+        Number of iteration with no improvement to wait before stopping.
+    max_iter : int
+        The maximum number of iterations (epochs).
+    tol: double
+        The tolerance for the stopping criterion.
+    fit_intercept : int
+        Whether or not to fit the intercept (1 or 0).
+    verbose : int
+        Print verbose output; 0 for quiet.
+    shuffle : boolean
+        Whether to shuffle the training data before each epoch.
+    weight_pos : float
+        The weight of the positive class.
+    weight_neg : float
+        The weight of the negative class.
+    seed : uint32_t
+        Seed of the pseudorandom number generator used to shuffle the data.
+    learning_rate : int
+        The learning rate:
+        (1) constant, eta = eta0
+        (2) optimal, eta = 1.0/(alpha * t).
+        (3) inverse scaling, eta = eta0 / pow(t, power_t)
+        (4) adaptive decrease
+        (5) Passive Aggressive-I, update = min(C, loss / sqnorm(x))
+        (6) Passive Aggressive-II, update = loss / (sqnorm(x) + 0.5 / C)
+    eta0 : double
+        The initial learning rate.
+    power_t : double
+        The exponent for inverse scaling learning rate.
+    one_class : boolean
+        Whether to solve the One-Class SVM optimization problem.
+    t : double
+        Initial state of the learning rate. This value is equal to the
+        iteration count except when the learning rate is set to `optimal`.
+        Default: 1.0.
+    average : int
+        The number of iterations before averaging starts. average=1 is
+        equivalent to averaging for all iterations.
+
+
+    Returns
+    -------
+    weights : array, shape=[n_features]
+        The fitted weight vector.
+    intercept : float
+        The fitted intercept term.
+    average_weights : array shape=[n_features]
+        The averaged weights across iterations. Values are valid only if
+        average > 0.
+    average_intercept : float
+        The averaged intercept across iterations.
+        Values are valid only if average > 0.
+    n_iter_ : int
+        The actual number of iter (epochs).
+    """
+
+    # get the data information into easy vars
+    cdef Py_ssize_t n_samples = dataset.n_samples
+    cdef Py_ssize_t n_features = weights.shape[0]
+
+    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
+    cdef {{c_type}} *x_data_ptr = NULL
+    cdef int *x_ind_ptr = NULL
+
+    # helper variables
+    cdef int no_improvement_count = 0
+    cdef bint infinity = False
+    cdef int xnnz
+    cdef double eta = 0.0
+    cdef double p = 0.0
+    cdef double update = 0.0
+    cdef double intercept_update = 0.0
+    cdef double sumloss = 0.0
+    cdef double score = 0.0
+    cdef double best_loss = INFINITY
+    cdef double best_score = -INFINITY
+    cdef {{c_type}} y = 0.0
+    cdef {{c_type}} sample_weight
+    cdef {{c_type}} class_weight = 1.0
+    cdef unsigned int count = 0
+    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
+    cdef unsigned int epoch = 0
+    cdef unsigned int i = 0
+    cdef int is_hinge = isinstance(loss, Hinge)
+    cdef double optimal_init = 0.0
+    cdef double dloss = 0.0
+    cdef double MAX_DLOSS = 1e12
+
+    cdef long long sample_index
+
+    # q vector is only used for L1 regularization
+    cdef {{c_type}}[::1] q = None
+    cdef {{c_type}} * q_data_ptr = NULL
+    if penalty_type == L1 or penalty_type == ELASTICNET:
+        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
+        q_data_ptr = &q[0]
+    cdef double u = 0.0
+
+    if penalty_type == L2:
+        l1_ratio = 0.0
+    elif penalty_type == L1:
+        l1_ratio = 1.0
+
+    eta = eta0
+
+    if learning_rate == OPTIMAL:
+        typw = np.sqrt(1.0 / np.sqrt(alpha))
+        # computing eta0, the initial learning rate
+        initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
+        # initialize t such that eta at first sample equals eta0
+        optimal_init = 1.0 / (initial_eta0 * alpha)
+
+    t_start = time()
+    with nogil:
+        for epoch in range(max_iter):
+            sumloss = 0
+            if verbose > 0:
+                with gil:
+                    print("-- Epoch %d" % (epoch + 1))
+            if shuffle:
+                dataset.shuffle(seed)
+            for i in range(n_samples):
+                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
+                             &y, &sample_weight)
+
+                sample_index = dataset.index_data_ptr[dataset.current_index]
+                if validation_mask[sample_index]:
+                    # do not learn on the validation set
+                    continue
+
+                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
+                if learning_rate == OPTIMAL:
+                    eta = 1.0 / (alpha * (optimal_init + t - 1))
+                elif learning_rate == INVSCALING:
+                    eta = eta0 / pow(t, power_t)
+
+                if verbose or not early_stopping:
+                    sumloss += loss.loss(y, p)
+
+                if y > 0.0:
+                    class_weight = weight_pos
+                else:
+                    class_weight = weight_neg
+
+                if learning_rate == PA1:
+                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
+                    if update == 0:
+                        continue
+                    update = min(C, loss.loss(y, p) / update)
+                elif learning_rate == PA2:
+                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
+                    update = loss.loss(y, p) / (update + 0.5 / C)
+                else:
+                    dloss = loss.dloss(y, p)
+                    # clip dloss with large values to avoid numerical
+                    # instabilities
+                    if dloss < -MAX_DLOSS:
+                        dloss = -MAX_DLOSS
+                    elif dloss > MAX_DLOSS:
+                        dloss = MAX_DLOSS
+                    update = -eta * dloss
+
+                if learning_rate >= PA1:
+                    if is_hinge:
+                        # classification
+                        update *= y
+                    elif y - p < 0:
+                        # regression
+                        update *= -1
+
+                update *= class_weight * sample_weight
+
+                if penalty_type >= L2:
+                    # do not scale to negative values when eta or alpha are too
+                    # big: instead set the weights to zero
+                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))
+
+                if update != 0.0:
+                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
+                if fit_intercept == 1:
+                    intercept_update = update
+                    if one_class:  # specific for One-Class SVM
+                        intercept_update -= 2. * eta * alpha
+                    if intercept_update != 0:
+                        intercept += intercept_update * intercept_decay
+
+                if 0 < average <= t:
+                    # compute the average for the intercept and update the
+                    # average weights, this is done regardless as to whether
+                    # the update is 0
+
+                    w.add_average(x_data_ptr, x_ind_ptr, xnnz,
+                                  update, (t - average + 1))
+                    average_intercept += ((intercept - average_intercept) /
+                                          (t - average + 1))
+
+                if penalty_type == L1 or penalty_type == ELASTICNET:
+                    u += (l1_ratio * eta * alpha)
+                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)
+
+                t += 1
+                count += 1
+
+            # report epoch information
+            if verbose > 0:
+                with gil:
+                    print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
+                          "Avg. loss: %f"
+                          % (w.norm(), np.nonzero(weights)[0].shape[0],
+                             intercept, count, sumloss / train_count))
+                    print("Total training time: %.2f seconds."
+                          % (time() - t_start))
+
+            # floating-point under-/overflow check.
+            if (not isfinite(intercept) or any_nonfinite(weights)):
+                infinity = True
+                break
+
+            # evaluate the score on the validation set
+            if early_stopping:
+                with gil:
+                    score = validation_score_cb(weights.base, intercept)
+                if tol > -INFINITY and score < best_score + tol:
+                    no_improvement_count += 1
+                else:
+                    no_improvement_count = 0
+                if score > best_score:
+                    best_score = score
+            # or evaluate the loss on the training set
+            else:
+                if tol > -INFINITY and sumloss > best_loss - tol * train_count:
+                    no_improvement_count += 1
+                else:
+                    no_improvement_count = 0
+                if sumloss < best_loss:
+                    best_loss = sumloss
+
+            # if there is no improvement several times in a row
+            if no_improvement_count >= n_iter_no_change:
+                if learning_rate == ADAPTIVE and eta > 1e-6:
+                    eta = eta / 5
+                    no_improvement_count = 0
+                else:
+                    if verbose:
+                        with gil:
+                            print("Convergence after %d epochs took %.2f "
+                                  "seconds" % (epoch + 1, time() - t_start))
+                    break
+
+    if infinity:
+        raise ValueError(("Floating-point under-/overflow occurred at epoch"
+                          " #%d. Scaling input data with StandardScaler or"
+                          " MinMaxScaler might help.") % (epoch + 1))
+
+    w.reset_wscale()
+
+    return (
+        weights.base,
+        intercept,
+        None if average_weights is None else average_weights.base,
+        average_intercept,
+        epoch + 1
+    )
+
+{{endfor}}
+
+
+cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
+    """Return True as soon as one entry of ``w`` is NaN or infinite."""
+    cdef Py_ssize_t n_entries = w.shape[0]
+    cdef Py_ssize_t pos
+    for pos in range(n_entries):
+        if isfinite(w[pos]):
+            continue
+        return True
+    return False
+
+
+cdef inline double sqnorm(
+    floating * x_data_ptr,
+    int * x_ind_ptr,
+    int xnnz,
+) noexcept nogil:
+    """Return the sum of squares of the first ``xnnz`` values at ``x_data_ptr``.
+
+    ``x_ind_ptr`` is part of the signature but is not read by this function.
+    """
+    cdef double total = 0.0
+    # Each value is widened to double before squaring so the accumulation is
+    # done in double precision whatever the fused input type is.
+    cdef double value
+    cdef int k = 0
+    while k < xnnz:
+        value = x_data_ptr[k]
+        total += value * value
+        k += 1
+    return total
+
+
+{{for name_suffix, c_type, np_type in dtypes}}
+
+cdef void l1penalty{{name_suffix}}(
+    WeightVector{{name_suffix}} w,
+    {{c_type}} * q_data_ptr,
+    int *x_ind_ptr,
+    int xnnz,
+    double u,
+) noexcept nogil:
+    """Apply the L1 penalty to each updated feature
+
+    This implements the truncated gradient approach by
+    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
+
+    Parameters
+    ----------
+    w : WeightVector{{name_suffix}}
+        Weight vector; its ``w_data_ptr`` buffer is modified in place and its
+        ``wscale`` factor is read.
+    q_data_ptr : pointer to {{c_type}}
+        Per-feature running total of the penalty actually applied so far;
+        updated in place for every touched feature.
+    x_ind_ptr : pointer to int
+        Feature indices of the current sample; only these entries of the
+        weight and ``q`` buffers are visited.
+    xnnz : int
+        Number of entries to read from ``x_ind_ptr``.
+    u : double
+        Cumulative L1 penalty budget accumulated by the caller.
+    """
+    cdef double z = 0.0
+    cdef int j = 0
+    cdef int idx = 0
+    cdef double wscale = w.wscale
+    cdef {{c_type}} *w_data_ptr = w.w_data_ptr
+    for j in range(xnnz):
+        idx = x_ind_ptr[j]
+        # remember the stored weight before clipping; needed for the q update
+        z = w_data_ptr[idx]
+        # the sign tests are done on wscale * z, i.e. the stored value
+        # multiplied by the vector's scale factor
+        if wscale * z > 0.0:
+            # positive weight: shrink towards zero, never crossing it
+            w_data_ptr[idx] = max(
+                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))
+
+        elif wscale * z < 0.0:
+            # negative weight: grow towards zero, never crossing it
+            w_data_ptr[idx] = min(
+                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))
+
+        # record the (scaled) change that was actually applied to this weight;
+        # a weight that was exactly zero is left untouched and adds 0 here
+        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)
+
+{{endfor}}
@@ -0,0 +1,456 @@
+"""
+A Theil-Sen Estimator for Multiple Linear Regression Model
+"""
+
+# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
+#
+# License: BSD 3 clause
+
+
+import warnings
+from itertools import combinations
+from numbers import Integral, Real
+
+import numpy as np
+from joblib import effective_n_jobs
+from scipy import linalg
+from scipy.linalg.lapack import get_lapack_funcs
+from scipy.special import binom
+
+from ..base import RegressorMixin, _fit_context
+from ..exceptions import ConvergenceWarning
+from ..utils import check_random_state
+from ..utils._param_validation import Interval
+from ..utils.parallel import Parallel, delayed
+from ._base import LinearModel
+
+# Machine epsilon of double precision; used below as the threshold under
+# which a distance or norm is treated as zero.
+_EPSILON = np.finfo(np.double).eps
+
+
+def _modified_weiszfeld_step(X, x_old):
+    """Modified Weiszfeld step.
+
+    This function defines one iteration step in order to approximate the
+    spatial median (L1 median). It is a form of an iteratively re-weighted
+    least squares method.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Training vector, where `n_samples` is the number of samples and
+        `n_features` is the number of features.
+
+    x_old : ndarray of shape = (n_features,)
+        Current start vector.
+
+    Returns
+    -------
+    x_new : ndarray of shape (n_features,)
+        New iteration step.
+
+    References
+    ----------
+    - On Computation of Spatial Median for Robust Data Mining, 2005
+      T. Kärkkäinen and S. Äyrämö
+      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
+    """
+    diff = X - x_old
+    diff_norm = np.sqrt(np.sum(diff**2, axis=1))
+    mask = diff_norm >= _EPSILON
+    # x_old equals one of our samples
+    is_x_old_in_X = int(mask.sum() < X.shape[0])
+
+    diff = diff[mask]
+    diff_norm = diff_norm[mask][:, np.newaxis]
+    quotient_norm = linalg.norm(np.sum(diff / diff_norm, axis=0))
+
+    if quotient_norm > _EPSILON:  # to avoid division by zero
+        new_direction = np.sum(X[mask, :] / diff_norm, axis=0) / np.sum(
+            1 / diff_norm, axis=0
+        )
+    else:
+        new_direction = 1.0
+        quotient_norm = 1.0
+
+    return (
+        max(0.0, 1.0 - is_x_old_in_X / quotient_norm) * new_direction
+        + min(1.0, is_x_old_in_X / quotient_norm) * x_old
+    )
+
+
+def _spatial_median(X, max_iter=300, tol=1.0e-3):
+    """Spatial median (L1 median).
+
+    The spatial median is member of a class of so-called M-estimators which
+    are defined by an optimization problem. Given a number of p points in an
+    n-dimensional space, the point x minimizing the sum of all distances to the
+    p other points is called spatial median.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Training vector, where `n_samples` is the number of samples and
+        `n_features` is the number of features.
+
+    max_iter : int, default=300
+        Maximum number of iterations.
+
+    tol : float, default=1.e-3
+        Stop the algorithm if spatial_median has converged.
+
+    Returns
+    -------
+    spatial_median : ndarray of shape = (n_features,)
+        Spatial median.
+
+    n_iter : int
+        Number of iterations needed.
+
+    References
+    ----------
+    - On Computation of Spatial Median for Robust Data Mining, 2005
+      T. Kärkkäinen and S. Äyrämö
+      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
+    """
+    if X.shape[1] == 1:
+        return 1, np.median(X.ravel(), keepdims=True)
+
+    tol **= 2  # We are computing the tol on the squared norm
+    spatial_median_old = np.mean(X, axis=0)
+
+    for n_iter in range(max_iter):
+        spatial_median = _modified_weiszfeld_step(X, spatial_median_old)
+        if np.sum((spatial_median_old - spatial_median) ** 2) < tol:
+            break
+        else:
+            spatial_median_old = spatial_median
+    else:
+        warnings.warn(
+            "Maximum number of iterations {max_iter} reached in "
+            "spatial median for TheilSen regressor."
+            "".format(max_iter=max_iter),
+            ConvergenceWarning,
+        )
+    return n_iter, spatial_median
+
+
+def _breakdown_point(n_samples, n_subsamples):
+    """Approximation of the breakdown point.
+
+    Parameters
+    ----------
+    n_samples : int
+        Number of samples.
+
+    n_subsamples : int
+        Number of subsamples to consider.
+
+    Returns
+    -------
+    breakdown_point : float
+        Approximation of breakdown point.
+    """
+    return (
+        1
+        - (
+            0.5 ** (1 / n_subsamples) * (n_samples - n_subsamples + 1)
+            + n_subsamples
+            - 1
+        )
+        / n_samples
+    )
+
+
+def _lstsq(X, y, indices, fit_intercept):
+    """Least Squares Estimator for TheilSenRegressor class.
+
+    This function calculates the least squares method on a subset of rows of X
+    and y defined by the indices array. Optionally, an intercept column is
+    added if intercept is set to true.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Design matrix, where `n_samples` is the number of samples and
+        `n_features` is the number of features.
+
+    y : ndarray of shape (n_samples,)
+        Target vector, where `n_samples` is the number of samples.
+
+    indices : ndarray of shape (n_subpopulation, n_subsamples)
+        Indices of all subsamples with respect to the chosen subpopulation.
+
+    fit_intercept : bool
+        Fit intercept or not.
+
+    Returns
+    -------
+    weights : ndarray of shape (n_subpopulation, n_features + intercept)
+        Solution matrix of n_subpopulation solved least square problems.
+    """
+    fit_intercept = int(fit_intercept)
+    n_features = X.shape[1] + fit_intercept
+    n_subsamples = indices.shape[1]
+    weights = np.empty((indices.shape[0], n_features))
+    X_subpopulation = np.ones((n_subsamples, n_features))
+    # gelss need to pad y_subpopulation to be of the max dim of X_subpopulation
+    y_subpopulation = np.zeros((max(n_subsamples, n_features)))
+    (lstsq,) = get_lapack_funcs(("gelss",), (X_subpopulation, y_subpopulation))
+
+    for index, subset in enumerate(indices):
+        X_subpopulation[:, fit_intercept:] = X[subset, :]
+        y_subpopulation[:n_subsamples] = y[subset]
+        weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features]
+
+    return weights
+
+
+class TheilSenRegressor(RegressorMixin, LinearModel):
+    """Theil-Sen Estimator: robust multivariate regression model.
+
+    The algorithm calculates least square solutions on subsets with size
+    n_subsamples of the samples in X. Any value of n_subsamples between the
+    number of features and samples leads to an estimator with a compromise
+    between robustness and efficiency. Since the number of least square
+    solutions is "n_samples choose n_subsamples", it can be extremely large
+    and can therefore be limited with max_subpopulation. If this limit is
+    reached, the subsets are chosen randomly. In a final step, the spatial
+    median (or L1 median) is calculated of all least square solutions.
+
+    Read more in the :ref:`User Guide <theil_sen_regression>`.
+
+    Parameters
+    ----------
+    fit_intercept : bool, default=True
+        Whether to calculate the intercept for this model. If set
+        to false, no intercept will be used in calculations.
+
+    copy_X : bool, default=True
+        If True, X will be copied; else, it may be overwritten.
+
+    max_subpopulation : int, default=1e4
+        Instead of computing with a set of cardinality 'n choose k', where n is
+        the number of samples and k is the number of subsamples (at least
+        number of features), consider only a stochastic subpopulation of a
+        given maximal size if 'n choose k' is larger than max_subpopulation.
+        For other than small problem sizes this parameter will determine
+        memory usage and runtime if n_subsamples is not changed. Note that the
+        data type should be int but floats such as 1e4 can be accepted too.
+
+    n_subsamples : int, default=None
+        Number of samples to calculate the parameters. This is at least the
+        number of features (plus 1 if fit_intercept=True) and the number of
+        samples as a maximum. A lower number leads to a higher breakdown
+        point and a low efficiency while a high number leads to a low
+        breakdown point and a high efficiency. If None, take the
+        minimum number of subsamples leading to maximal robustness.
+        If n_subsamples is set to n_samples, Theil-Sen is identical to least
+        squares.
+
+    max_iter : int, default=300
+        Maximum number of iterations for the calculation of spatial median.
+
+    tol : float, default=1e-3
+        Tolerance when calculating spatial median.
+
+    random_state : int, RandomState instance or None, default=None
+        A random number generator instance to define the state of the random
+        permutations generator. Pass an int for reproducible output across
+        multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    n_jobs : int, default=None
+        Number of CPUs to use during the cross validation.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : bool, default=False
+        Verbose mode when fitting the model.
+
+    Attributes
+    ----------
+    coef_ : ndarray of shape (n_features,)
+        Coefficients of the regression model (median of distribution).
+
+    intercept_ : float
+        Estimated intercept of regression model.
+
+    breakdown_ : float
+        Approximated breakdown point.
+
+    n_iter_ : int
+        Number of iterations needed for the spatial median.
+
+    n_subpopulation_ : int
+        Number of combinations taken into account from 'n choose k', where n is
+        the number of samples and k is the number of subsamples.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    HuberRegressor : Linear regression model that is robust to outliers.
+    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
+    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.
+
+    References
+    ----------
+    - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009
+      Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang
+      http://home.olemiss.edu/~xdang/papers/MTSE.pdf
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import TheilSenRegressor
+    >>> from sklearn.datasets import make_regression
+    >>> X, y = make_regression(
+    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
+    >>> reg = TheilSenRegressor(random_state=0).fit(X, y)
+    >>> reg.score(X, y)
+    0.9884...
+    >>> reg.predict(X[:1,])
+    array([-31.5871...])
+    """
+
+    _parameter_constraints: dict = {
+        "fit_intercept": ["boolean"],
+        "copy_X": ["boolean"],
+        # target_type should be Integral but can accept Real for backward compatibility
+        "max_subpopulation": [Interval(Real, 1, None, closed="left")],
+        "n_subsamples": [None, Integral],
+        "max_iter": [Interval(Integral, 0, None, closed="left")],
+        "tol": [Interval(Real, 0.0, None, closed="left")],
+        "random_state": ["random_state"],
+        "n_jobs": [None, Integral],
+        "verbose": ["verbose"],
+    }
+
+    def __init__(
+        self,
+        *,
+        fit_intercept=True,
+        copy_X=True,
+        max_subpopulation=1e4,
+        n_subsamples=None,
+        max_iter=300,
+        tol=1.0e-3,
+        random_state=None,
+        n_jobs=None,
+        verbose=False,
+    ):
+        self.fit_intercept = fit_intercept
+        self.copy_X = copy_X
+        self.max_subpopulation = max_subpopulation
+        self.n_subsamples = n_subsamples
+        self.max_iter = max_iter
+        self.tol = tol
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+        self.verbose = verbose
+
+    def _check_subparams(self, n_samples, n_features):
+        n_subsamples = self.n_subsamples
+
+        if self.fit_intercept:
+            n_dim = n_features + 1
+        else:
+            n_dim = n_features
+
+        if n_subsamples is not None:
+            if n_subsamples > n_samples:
+                raise ValueError(
+                    "Invalid parameter since n_subsamples > "
+                    "n_samples ({0} > {1}).".format(n_subsamples, n_samples)
+                )
+            if n_samples >= n_features:
+                if n_dim > n_subsamples:
+                    plus_1 = "+1" if self.fit_intercept else ""
+                    raise ValueError(
+                        "Invalid parameter since n_features{0} "
+                        "> n_subsamples ({1} > {2})."
+                        "".format(plus_1, n_dim, n_subsamples)
+                    )
+            else:  # if n_samples < n_features
+                if n_subsamples != n_samples:
+                    raise ValueError(
+                        "Invalid parameter since n_subsamples != "
+                        "n_samples ({0} != {1}) while n_samples "
+                        "< n_features.".format(n_subsamples, n_samples)
+                    )
+        else:
+            n_subsamples = min(n_dim, n_samples)
+
+        all_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))
+        n_subpopulation = int(min(self.max_subpopulation, all_combinations))
+
+        return n_subsamples, n_subpopulation
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y):
+        """Fit linear model.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, n_features)
+            Training data.
+        y : ndarray of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        self : returns an instance of self.
+            Fitted `TheilSenRegressor` estimator.
+        """
+        random_state = check_random_state(self.random_state)
+        X, y = self._validate_data(X, y, y_numeric=True)
+        n_samples, n_features = X.shape
+        n_subsamples, self.n_subpopulation_ = self._check_subparams(
+            n_samples, n_features
+        )
+        self.breakdown_ = _breakdown_point(n_samples, n_subsamples)
+
+        if self.verbose:
+            print("Breakdown point: {0}".format(self.breakdown_))
+            print("Number of samples: {0}".format(n_samples))
+            tol_outliers = int(self.breakdown_ * n_samples)
+            print("Tolerable outliers: {0}".format(tol_outliers))
+            print("Number of subpopulations: {0}".format(self.n_subpopulation_))
+
+        # Determine indices of subpopulation
+        if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation:
+            indices = list(combinations(range(n_samples), n_subsamples))
+        else:
+            indices = [
+                random_state.choice(n_samples, size=n_subsamples, replace=False)
+                for _ in range(self.n_subpopulation_)
+            ]
+
+        n_jobs = effective_n_jobs(self.n_jobs)
+        index_list = np.array_split(indices, n_jobs)
+        weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
+            delayed(_lstsq)(X, y, index_list[job], self.fit_intercept)
+            for job in range(n_jobs)
+        )
+        weights = np.vstack(weights)
+        self.n_iter_, coefs = _spatial_median(
+            weights, max_iter=self.max_iter, tol=self.tol
+        )
+
+        if self.fit_intercept:
+            self.intercept_ = coefs[0]
+            self.coef_ = coefs[1:]
+        else:
+            self.intercept_ = 0.0
+            self.coef_ = coefs
+
+        return self
@@ -0,0 +1,31 @@
+# .pyx is generated, so this is needed to make Cython compilation work
+# (the package __init__.py and the _sgd_fast.pxd declarations are copied so
+# they sit next to the generated sources).
+linear_model_cython_tree = [
+  fs.copyfile('__init__.py'),
+  fs.copyfile('_sgd_fast.pxd'),
+]
+
+# _cd_fast is built straight from its hand-written .pyx file.
+py.extension_module(
+  '_cd_fast',
+  ['_cd_fast.pyx', utils_cython_tree],
+  cython_args: cython_args,
+  subdir: 'sklearn/linear_model',
+  install: true
+)
+
+# Extensions whose .pyx source is produced from a '<name>.pyx.tp' template.
+name_list = ['_sgd_fast', '_sag_fast']
+
+foreach name: name_list
+  # Step 1: run the tempita command on the template to generate '<name>.pyx'.
+  pyx = custom_target(
+    name + '_pyx',
+    output: name + '.pyx',
+    input: name + '.pyx.tp',
+    command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@']
+  )
+  # Step 2: build the extension module from the generated .pyx.
+  py.extension_module(
+    name,
+    [pyx, linear_model_cython_tree, utils_cython_tree],
+    cython_args: cython_args,
+    subdir: 'sklearn/linear_model',
+    install: true
+)
+endforeach
@@ -0,0 +1,789 @@
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
+#         Maria Telenczuk <https://github.com/maikia>
+#
+# License: BSD 3 clause
+
+import warnings
+
+import numpy as np
+import pytest
+from scipy import linalg, sparse
+
+from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated
+from sklearn.linear_model import LinearRegression
+from sklearn.linear_model._base import (
+    _preprocess_data,
+    _rescale_data,
+    make_dataset,
+)
+from sklearn.preprocessing import add_dummy_feature
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
+from sklearn.utils.fixes import (
+    COO_CONTAINERS,
+    CSC_CONTAINERS,
+    CSR_CONTAINERS,
+    LIL_CONTAINERS,
+)
+
+# Relative tolerance used by test_fused_types_make_dataset when comparing the
+# float32 and float64 results.
+rtol = 1e-6
+
+
+def test_linear_regression():
+    """Fit LinearRegression on tiny datasets and check the fitted values."""
+    # Two points lying on the line y = x.
+    features, targets = [[1], [2]], [1, 2]
+    model = LinearRegression()
+    model.fit(features, targets)
+
+    assert_array_almost_equal(model.coef_, [1])
+    assert_array_almost_equal(model.intercept_, [0])
+    assert_array_almost_equal(model.predict(features), [1, 2])
+
+    # Degenerate input: a single sample.
+    features, targets = [[1]], [0]
+    model = LinearRegression()
+    model.fit(features, targets)
+
+    assert_array_almost_equal(model.coef_, [0])
+    assert_array_almost_equal(model.intercept_, [0])
+    assert_array_almost_equal(model.predict(features), [0])
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
+@pytest.mark.parametrize("fit_intercept", [True, False])
+def test_linear_regression_sample_weights(
+    sparse_container, fit_intercept, global_random_seed
+):
+    """Compare a weighted LinearRegression fit with the closed-form solution."""
+    rng = np.random.RandomState(global_random_seed)
+
+    # Keep n_samples > n_features: the comparison would not work with
+    # under-determined systems.
+    n_samples, n_features = 6, 5
+
+    X = rng.normal(size=(n_samples, n_features))
+    if sparse_container is not None:
+        X = sparse_container(X)
+    y = rng.normal(size=n_samples)
+    sample_weight = 1.0 + rng.uniform(size=n_samples)
+
+    # Estimator fitted with explicit sample weights.
+    model = LinearRegression(fit_intercept=fit_intercept)
+    model.fit(X, y, sample_weight=sample_weight)
+    fitted_coef, fitted_intercept = model.coef_, model.intercept_
+
+    assert model.coef_.shape == (X.shape[1],)  # sanity check
+
+    # Closed form of weighted least squares:
+    #     theta = (X^T W X)^(-1) @ X^T W y
+    # where X gets an extra leading column when an intercept is fitted.
+    weight_matrix = np.diag(sample_weight)
+    design = add_dummy_feature(X) if fit_intercept else X
+    theta = linalg.solve(
+        design.T @ weight_matrix @ design,
+        design.T @ weight_matrix @ y,
+    )
+
+    if fit_intercept:
+        # First entry belongs to the dummy column, i.e. the intercept.
+        assert_allclose(fitted_coef, theta[1:])
+        assert_allclose(fitted_intercept, theta[0])
+    else:
+        assert_allclose(fitted_coef, theta)
+
+
+def test_raises_value_error_if_positive_and_sparse():
+    """Fitting with positive=True on sparse X must raise a TypeError."""
+    expected_message = "Sparse data was passed for X, but dense data is required."
+
+    # X must be dense when positive=True; a sparse identity triggers the error.
+    sparse_X, y = sparse.eye(10), np.ones(10)
+    model = LinearRegression(positive=True)
+
+    with pytest.raises(TypeError, match=expected_message):
+        model.fit(sparse_X, y)
+
+
+@pytest.mark.parametrize("n_samples, n_features", [(2, 3), (3, 2)])
+def test_raises_value_error_if_sample_weights_greater_than_1d(n_samples, n_features):
+    """Fit with 1D-array and scalar sample weights; each fit must succeed."""
+    rng = np.random.RandomState(0)
+    X = rng.randn(n_samples, n_features)
+    y = rng.randn(n_samples)
+
+    # Sample weights must be either scalar or 1D: one 1D array, two scalars.
+    accepted_weights = (rng.randn(n_samples) ** 2 + 1, 1.0, 2.0)
+
+    model = LinearRegression()
+
+    # make sure the "OK" sample weights actually work
+    for weights in accepted_weights:
+        model.fit(X, y, weights)
+
+
+def test_fit_intercept():
+    """The shape of coef_ must not depend on the fit_intercept setting."""
+    y = np.array([1, 1])
+    X_two_features = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]])
+    X_three_features = np.array(
+        [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]]
+    )
+
+    no_intercept_2 = LinearRegression(fit_intercept=False).fit(X_two_features, y)
+    intercept_2 = LinearRegression().fit(X_two_features, y)
+
+    no_intercept_3 = LinearRegression(fit_intercept=False).fit(X_three_features, y)
+    intercept_3 = LinearRegression().fit(X_three_features, y)
+
+    # Same coef_ shape with and without intercept, for both feature counts.
+    assert intercept_2.coef_.shape == no_intercept_2.coef_.shape
+    assert intercept_3.coef_.shape == no_intercept_3.coef_.shape
+    # The number of dimensions of coef_ is the same for 2 and 3 features.
+    assert no_intercept_2.coef_.ndim == no_intercept_3.coef_.ndim
+
+
+def test_linear_regression_sparse(global_random_seed):
+    """LinearRegression must also work when X is a sparse matrix."""
+    rng = np.random.RandomState(global_random_seed)
+    n_dims = 100
+    identity = sparse.eye(n_dims, n_dims)
+    true_coef = rng.rand(n_dims)
+    target = (identity @ true_coef).ravel()
+
+    model = LinearRegression()
+    model.fit(identity, target)
+
+    # X is the identity, so sample i is predicted as coef_[i] + intercept_.
+    assert_array_almost_equal(true_coef, model.coef_ + model.intercept_)
+    assert_array_almost_equal(model.predict(identity) - target, 0)
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_linear_regression_sparse_equal_dense(fit_intercept, csr_container):
+    """Dense and CSR versions of the same data must give the same fit."""
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 200, 2
+    X_dense = rng.randn(n_samples, n_features)
+    X_dense[X_dense < 0.1] = 0.0  # introduce actual zeros before converting
+    X_sparse = csr_container(X_dense)
+    y = rng.rand(n_samples)
+
+    dense_model = LinearRegression(fit_intercept=fit_intercept)
+    dense_model.fit(X_dense, y)
+    sparse_model = LinearRegression(fit_intercept=fit_intercept)
+    sparse_model.fit(X_sparse, y)
+
+    assert dense_model.intercept_ == pytest.approx(sparse_model.intercept_)
+    assert_allclose(dense_model.coef_, sparse_model.coef_)
+
+
+def test_linear_regression_multiple_outcome():
+    """Fitting two identical target columns must match the single-target fit."""
+    rng = np.random.RandomState(0)
+    X, y = make_regression(random_state=rng)
+    stacked_y = np.vstack((y, y)).T
+
+    model = LinearRegression()
+
+    # Two outputs: one row of coefficients per target column.
+    model.fit(X, stacked_y)
+    assert model.coef_.shape == (2, X.shape[1])
+    multi_pred = model.predict(X)
+
+    # Refit the same estimator on the single target and compare predictions.
+    model.fit(X, y)
+    single_pred = model.predict(X)
+    assert_array_almost_equal(
+        np.vstack((single_pred, single_pred)).T, multi_pred, decimal=3
+    )
+
+
+@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
+def test_linear_regression_sparse_multiple_outcome(global_random_seed, coo_container):
+    """Multi-target fit on sparse X must match the single-target fit."""
+    rng = np.random.RandomState(global_random_seed)
+    dense_X, y = make_sparse_uncorrelated(random_state=rng)
+    X = coo_container(dense_X)
+    stacked_y = np.vstack((y, y)).T
+
+    model = LinearRegression()
+
+    # Two outputs: one row of coefficients per target column.
+    model.fit(X, stacked_y)
+    assert model.coef_.shape == (2, X.shape[1])
+    multi_pred = model.predict(X)
+
+    # Refit the same estimator on the single target and compare predictions.
+    model.fit(X, y.ravel())
+    single_pred = model.predict(X)
+    assert_array_almost_equal(
+        np.vstack((single_pred, single_pred)).T, multi_pred, decimal=3
+    )
+
+
+def test_linear_regression_positive():
+    """Fit LinearRegression(positive=True) on tiny datasets."""
+    # Two points lying on the line y = x.
+    features, targets = [[1], [2]], [1, 2]
+    model = LinearRegression(positive=True)
+    model.fit(features, targets)
+
+    assert_array_almost_equal(model.coef_, [1])
+    assert_array_almost_equal(model.intercept_, [0])
+    assert_array_almost_equal(model.predict(features), [1, 2])
+
+    # Degenerate input: a single sample.
+    features, targets = [[1]], [0]
+    model = LinearRegression(positive=True)
+    model.fit(features, targets)
+
+    assert_allclose(model.coef_, [0])
+    assert_allclose(model.intercept_, [0])
+    assert_allclose(model.predict(features), [0])
+
+
+def test_linear_regression_positive_multiple_outcome(global_random_seed):
+    """Non-negative multi-target fit must match the single-target fit."""
+    rng = np.random.RandomState(global_random_seed)
+    X, y = make_sparse_uncorrelated(random_state=rng)
+    stacked_y = np.vstack((y, y)).T
+
+    model = LinearRegression(positive=True)
+
+    # Two outputs: one row of non-negative coefficients per target column.
+    model.fit(X, stacked_y)
+    assert model.coef_.shape == (2, X.shape[1])
+    assert np.all(model.coef_ >= 0.0)
+    multi_pred = model.predict(X)
+
+    # Refit the same estimator on the single target and compare predictions.
+    model.fit(X, y.ravel())
+    single_pred = model.predict(X)
+    assert_allclose(np.vstack((single_pred, single_pred)).T, multi_pred)
+
+
+def test_linear_regression_positive_vs_nonpositive(global_random_seed):
+    """On this dataset positive=True must change the fitted coefficients."""
+    rng = np.random.RandomState(global_random_seed)
+    X, y = make_sparse_uncorrelated(random_state=rng)
+
+    constrained = LinearRegression(positive=True)
+    constrained.fit(X, y)
+    unconstrained = LinearRegression(positive=False)
+    unconstrained.fit(X, y)
+
+    # Mean squared difference between the two coefficient vectors.
+    coef_gap = np.mean((constrained.coef_ - unconstrained.coef_) ** 2)
+    assert coef_gap > 1e-3
+
+
+def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_seed):
+    """With positive true coefficients both settings must give the same fit."""
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features = 200, 4
+    X = rng.rand(n_samples, n_features)
+    # Noise-free target built from strictly positive coefficients.
+    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]
+
+    constrained = LinearRegression(positive=True)
+    constrained.fit(X, y)
+    unconstrained = LinearRegression(positive=False)
+    unconstrained.fit(X, y)
+
+    # Mean squared difference between the two coefficient vectors.
+    coef_gap = np.mean((constrained.coef_ - unconstrained.coef_) ** 2)
+    assert coef_gap < 1e-6
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
+@pytest.mark.parametrize("use_sw", [True, False])
+def test_inplace_data_preprocessing(sparse_container, use_sw, global_random_seed):
+    """Check which inputs LinearRegression.fit may modify in place.
+
+    With the default ``copy_X`` nothing may change. With ``copy_X=False`` only
+    dense X is expected to change; sparse X, y and sample_weight must not.
+    """
+    rng = np.random.RandomState(global_random_seed)
+    X_reference = rng.randn(10, 12)
+    y_reference = rng.randn(10, 2)
+    sw_reference = rng.rand(10)
+
+    is_sparse = sparse_container is not None
+    X = sparse_container(X_reference) if is_sparse else X_reference.copy()
+    # XXX: Note that y_sparse is not supported (broken?) in the current
+    # implementation of LinearRegression.
+    y = y_reference.copy()
+    sample_weight = sw_reference.copy() if use_sw else None
+
+    # Default copy_X: inplace preprocessing of X and y is not allowed.
+    model = LinearRegression()
+    model.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(X.toarray() if is_sparse else X, X_reference)
+    assert_allclose(y, y_reference)
+    if use_sw:
+        assert_allclose(sample_weight, sw_reference)
+
+    # copy_X=False: inplace preprocessing of X is allowed.
+    model = LinearRegression(copy_X=False)
+    model.fit(X, y, sample_weight=sample_weight)
+    if is_sparse:
+        # No optimization relying on the inplace modification of sparse input
+        # data has been implemented at this time.
+        assert_allclose(X.toarray(), X_reference)
+    else:
+        # X has been offset (and optionally rescaled by sample weights)
+        # inplace. The 0.42 threshold is arbitrary and has been found to be
+        # robust to any random seed in the admissible range.
+        assert np.linalg.norm(X - X_reference) > 0.42
+
+    # y should not have been modified inplace by LinearRegression.fit.
+    assert_allclose(y, y_reference)
+
+    if use_sw:
+        # Sample weights have no reason to ever be modified inplace.
+        assert_allclose(sample_weight, sw_reference)
+
+
+def test_linear_regression_pd_sparse_dataframe_warning():
+    """Check the warning raised for DataFrames with sparse columns.
+
+    A UserWarning is expected when only some of the columns passed as X are
+    sparse, and no UserWarning when all of them are sparse.
+    """
+    pd = pytest.importorskip("pandas")
+    # Local seeded generator so the fixture is reproducible.
+    rng = np.random.RandomState(0)
+
+    # Column "0" is dense; columns "1" to "3" are sparse.
+    df = pd.DataFrame({"0": rng.randn(10)})
+    for col in range(1, 4):
+        arr = rng.randn(10)
+        arr[:8] = 0
+        df[str(col)] = pd.arrays.SparseArray(arr, fill_value=0)
+
+    msg = "pandas.DataFrame with sparse columns found."
+
+    # X mixes the dense column "0" with the sparse column "1": must warn.
+    reg = LinearRegression()
+    with pytest.warns(UserWarning, match=msg):
+        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])
+
+    # does not warn when the whole dataframe is sparse
+    df["0"] = pd.arrays.SparseArray(df["0"], fill_value=0)
+    assert hasattr(df, "sparse")
+
+    with warnings.catch_warnings():
+        # Turn any UserWarning into an error so a stray warning fails the test.
+        warnings.simplefilter("error", UserWarning)
+        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])
+
+
+def test_preprocess_data(global_random_seed):
+    """_preprocess_data centers dense X and y only when fit_intercept=True."""
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features = 200, 2
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples)
+    X_mean_ref = np.mean(X, axis=0)
+    y_mean_ref = np.mean(y, axis=0)
+
+    # Without intercept: zero offsets, unit scale, data returned unchanged.
+    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=False)
+    for actual, expected in (
+        (X_mean, np.zeros(n_features)),
+        (y_mean, 0),
+        (X_scale, np.ones(n_features)),
+        (Xt, X),
+        (yt, y),
+    ):
+        assert_array_almost_equal(actual, expected)
+
+    # With intercept: offsets are the means, data is centered, unit scale.
+    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=True)
+    for actual, expected in (
+        (X_mean, X_mean_ref),
+        (y_mean, y_mean_ref),
+        (X_scale, np.ones(n_features)),
+        (Xt, X - X_mean_ref),
+        (yt, y - y_mean_ref),
+    ):
+        assert_array_almost_equal(actual, expected)
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS)
+def test_preprocess_data_multioutput(global_random_seed, sparse_container):
+    """A 2D y is centered per output column when fit_intercept=True."""
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features, n_outputs = 200, 3, 2
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples, n_outputs)
+    y_mean_ref = np.mean(y, axis=0)
+
+    if sparse_container is not None:
+        X = sparse_container(X)
+
+    # Without intercept: zero offset per output and y returned unchanged.
+    _, y_out, _, y_offset, _ = _preprocess_data(X, y, fit_intercept=False)
+    assert_array_almost_equal(y_offset, np.zeros(n_outputs))
+    assert_array_almost_equal(y_out, y)
+
+    # With intercept: the offset is the column mean and y is centered by it.
+    _, y_out, _, y_offset, _ = _preprocess_data(X, y, fit_intercept=True)
+    assert_array_almost_equal(y_offset, y_mean_ref)
+    assert_array_almost_equal(y_out, y - y_offset)
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
+def test_preprocess_data_weighted(sparse_container, global_random_seed):
+    """Check the weighted centering done by _preprocess_data.
+
+    With ``sample_weight`` and ``fit_intercept=True`` the returned offsets
+    must be the weighted means of X and y and the returned scale must be all
+    ones. Dense X must come back centered; sparse X must come back unchanged.
+    """
+    rng = np.random.RandomState(global_random_seed)
+    n_samples = 200
+    n_features = 4
+    # Generate random data with 50% of zero values to make sure
+    # that the sparse variant of this test is actually sparse. This also
+    # shifts the mean value for each columns in X further away from
+    # zero.
+    X = rng.rand(n_samples, n_features)
+    X[X < 0.5] = 0.0
+
+    # Scale the first feature of X to be 10 larger than the other to
+    # better check the impact of feature scaling.
+    X[:, 0] *= 10
+
+    # Constant non-zero feature.
+    X[:, 2] = 1.0
+
+    # Constant zero feature (non-materialized in the sparse case)
+    X[:, 3] = 0.0
+    y = rng.rand(n_samples)
+
+    sample_weight = rng.rand(n_samples)
+    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
+    expected_y_mean = np.average(y, axis=0, weights=sample_weight)
+
+    # Sanity check of the fixture itself: only the last two features have a
+    # (near) zero weighted variance.
+    X_sample_weight_var = np.average(
+        (X - expected_X_mean) ** 2, weights=sample_weight, axis=0
+    )
+    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
+    assert_array_equal(constant_mask, [0, 0, 1, 1])
+
+    if sparse_container is not None:
+        X = sparse_container(X)
+
+    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
+        X,
+        y,
+        fit_intercept=True,
+        sample_weight=sample_weight,
+    )
+    assert_array_almost_equal(X_mean, expected_X_mean)
+    assert_array_almost_equal(y_mean, expected_y_mean)
+    # No feature scaling is applied: the returned scale is all ones.
+    assert_array_almost_equal(X_scale, np.ones(n_features))
+    if sparse_container is not None:
+        # Sparse X is returned without the offset being subtracted.
+        assert_array_almost_equal(Xt.toarray(), X.toarray())
+    else:
+        assert_array_almost_equal(Xt, X - expected_X_mean)
+    assert_array_almost_equal(yt, y - expected_y_mean)
+
+
+@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
+def test_sparse_preprocess_data_offsets(global_random_seed, lil_container):
+    """For sparse X the offsets are returned but X itself is left uncentered."""
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features = 200, 2
+    X = lil_container(
+        sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
+    )
+    y = rng.rand(n_samples)
+    X_dense = X.toarray()
+
+    # Without intercept: zero offsets, unit scale, data returned unchanged.
+    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=False)
+    for actual, expected in (
+        (X_mean, np.zeros(n_features)),
+        (y_mean, 0),
+        (X_scale, np.ones(n_features)),
+        (Xt.toarray(), X_dense),
+        (yt, y),
+    ):
+        assert_array_almost_equal(actual, expected)
+
+    # With intercept: offsets are the means and y is centered, but the sparse
+    # X is returned with its original values.
+    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=True)
+    for actual, expected in (
+        (X_mean, np.mean(X_dense, axis=0)),
+        (y_mean, np.mean(y, axis=0)),
+        (X_scale, np.ones(n_features)),
+        (Xt.toarray(), X_dense),
+        (yt, y - np.mean(y, axis=0)),
+    ):
+        assert_array_almost_equal(actual, expected)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_csr_preprocess_data(csr_container):
+    """_preprocess_data must return X in CSR format when given CSR input."""
+    dense_X, y = make_regression()
+    dense_X[dense_X < 2.5] = 0.0
+    X_csr = csr_container(dense_X)
+
+    preprocessed_X, y, _, _, _ = _preprocess_data(X_csr, y, fit_intercept=True)
+
+    assert preprocessed_X.format == "csr"
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
+@pytest.mark.parametrize("to_copy", (True, False))
+def test_preprocess_copy_data_no_checks(sparse_container, to_copy):
+    """With check_input=False the returned X shares memory iff copy=False."""
+    X, y = make_regression()
+    X[X < 2.5] = 0.0
+
+    is_sparse = sparse_container is not None
+    if is_sparse:
+        X = sparse_container(X)
+
+    X_out, _, _, _, _ = _preprocess_data(
+        X, y, fit_intercept=True, copy=to_copy, check_input=False
+    )
+
+    # For sparse containers compare the underlying data buffers.
+    if is_sparse:
+        shares_memory = np.may_share_memory(X_out.data, X.data)
+    else:
+        shares_memory = np.may_share_memory(X_out, X)
+
+    if to_copy:
+        assert not shares_memory
+    else:
+        assert shares_memory
+
+
+def test_dtype_preprocess_data(global_random_seed):
+    """Outputs of _preprocess_data take the dtype of X, whatever y's dtype."""
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features = 200, 2
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples)
+
+    X_32 = np.asarray(X, dtype=np.float32)
+    y_32 = np.asarray(y, dtype=np.float32)
+    X_64 = np.asarray(X, dtype=np.float64)
+    y_64 = np.asarray(y, dtype=np.float64)
+
+    def preprocess(X_in, y_in, fit_intercept):
+        # Unpack explicitly so that exactly five outputs are required.
+        Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
+            X_in,
+            y_in,
+            fit_intercept=fit_intercept,
+        )
+        return Xt, yt, X_mean, y_mean, X_scale
+
+    for fit_intercept in [True, False]:
+        out_32 = preprocess(X_32, y_32, fit_intercept)
+        out_64 = preprocess(X_64, y_64, fit_intercept)
+        out_3264 = preprocess(X_32, y_64, fit_intercept)
+        out_6432 = preprocess(X_64, y_32, fit_intercept)
+
+        # Each of the five outputs must have the dtype of the X that was
+        # passed in, including for the mixed X/y dtype combinations.
+        expectations = (
+            (out_32, np.float32),
+            (out_64, np.float64),
+            (out_3264, np.float32),
+            (out_6432, np.float64),
+        )
+        for outputs, expected_dtype in expectations:
+            for array in outputs:
+                assert array.dtype == expected_dtype
+
+        # The inputs themselves must keep their dtype.
+        assert X_32.dtype == np.float32
+        assert y_32.dtype == np.float32
+        assert X_64.dtype == np.float64
+        assert y_64.dtype == np.float64
+
+        # float32 and float64 results must agree output by output.
+        for result_32, result_64 in zip(out_32, out_64):
+            assert_array_almost_equal(result_32, result_64)
+
+
+@pytest.mark.parametrize("n_targets", [None, 2])
+@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
+def test_rescale_data(n_targets, sparse_container, global_random_seed):
+    """_rescale_data multiplies the rows of X and y by sqrt(sample_weight)."""
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features = 200, 2
+    single_target = n_targets is None
+    is_sparse = sparse_container is not None
+
+    sample_weight = 1.0 + rng.rand(n_samples)
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples) if single_target else rng.rand(n_samples, n_targets)
+
+    # Expected values, computed on the dense data.
+    expected_sqrt_sw = np.sqrt(sample_weight)
+    expected_rescaled_X = X * expected_sqrt_sw[:, np.newaxis]
+    if single_target:
+        expected_rescaled_y = y * expected_sqrt_sw
+    else:
+        expected_rescaled_y = y * expected_sqrt_sw[:, np.newaxis]
+
+    if is_sparse:
+        X = sparse_container(X)
+        # A 1D y is turned into a single column before being made sparse.
+        y = sparse_container(y.reshape(-1, 1) if single_target else y)
+
+    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)
+
+    assert_allclose(sqrt_sw, expected_sqrt_sw)
+
+    if is_sparse:
+        # Back to dense (and back to 1D for a single target) for comparison.
+        rescaled_X = rescaled_X.toarray()
+        rescaled_y = rescaled_y.toarray()
+        if single_target:
+            rescaled_y = rescaled_y.ravel()
+
+    assert_allclose(rescaled_X, expected_rescaled_X)
+    assert_allclose(rescaled_y, expected_rescaled_y)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_fused_types_make_dataset(csr_container):
+    """make_dataset keeps float32/float64 dtypes for dense and CSR input."""
+    iris = load_iris()
+
+    def as_inputs(dtype):
+        # Dense X, CSR X, y and sample weights, all with the requested dtype.
+        X = iris.data.astype(dtype)
+        y = iris.target.astype(dtype)
+        return X, csr_container(X), y, np.arange(y.size, dtype=dtype)
+
+    X_32, X_csr_32, y_32, sample_weight_32 = as_inputs(np.float32)
+    X_64, X_csr_64, y_64, sample_weight_64 = as_inputs(np.float64)
+
+    # Dense input: take the first sample returned by each dataset.
+    dense_32, _ = make_dataset(X_32, y_32, sample_weight_32)
+    dense_64, _ = make_dataset(X_64, y_64, sample_weight_64)
+    (dense_data_32, _, _), dense_y_32, _, _ = dense_32._next_py()
+    (dense_data_64, _, _), dense_y_64, _, _ = dense_64._next_py()
+
+    assert dense_data_32.dtype == np.float32
+    assert dense_data_64.dtype == np.float64
+    assert_allclose(dense_y_64, dense_y_32, rtol=rtol)
+
+    # CSR input: same procedure.
+    csr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
+    csr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)
+    (csr_data_32, _, _), csr_y_32, _, _ = csr_32._next_py()
+    (csr_data_64, _, _), csr_y_64, _, _ = csr_64._next_py()
+
+    assert csr_data_32.dtype == np.float32
+    assert csr_data_64.dtype == np.float64
+
+    assert_allclose(csr_data_64, csr_data_32, rtol=rtol)
+    assert_allclose(csr_y_64, csr_y_32, rtol=rtol)
+
+    # Dense and CSR datasets must return exactly the same first sample.
+    assert_array_equal(dense_data_32, csr_data_32)
+    assert_array_equal(dense_data_64, csr_data_64)
+    assert_array_equal(dense_y_32, csr_y_32)
+    assert_array_equal(dense_y_64, csr_y_64)
+
+
+@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
+@pytest.mark.parametrize("fit_intercept", [False, True])
+def test_linear_regression_sample_weight_consistency(
+    sparse_container, fit_intercept, global_random_seed
+):
+    """Test that the impact of sample_weight is consistent.
+
+    Note that this test is stricter than the common test
+    check_sample_weights_invariance alone and also tests sparse X.
+    It is very similar to test_enet_sample_weight_consistency.
+
+    The numbered steps below are order dependent: they share the random
+    generator, re-assign the reference ``coef``/``intercept`` in step 3) and
+    modify ``y`` in place in step 4).
+    """
+    rng = np.random.RandomState(global_random_seed)
+    n_samples, n_features = 10, 5
+
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples)
+    if sparse_container is not None:
+        X = sparse_container(X)
+    params = dict(fit_intercept=fit_intercept)
+
+    # Reference fit without weights, used by steps 1) and 2). The coefficients
+    # are copied so they stay independent of `reg`, which is refit below.
+    reg = LinearRegression(**params).fit(X, y, sample_weight=None)
+    coef = reg.coef_.copy()
+    if fit_intercept:
+        intercept = reg.intercept_
+
+    # 1) sample_weight=np.ones(..) must be equivalent to sample_weight=None
+    # same check as check_sample_weights_invariance(name, reg, kind="ones"), but we also
+    # test with sparse input.
+    sample_weight = np.ones_like(y)
+    reg.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(reg.coef_, coef, rtol=1e-6)
+    if fit_intercept:
+        assert_allclose(reg.intercept_, intercept)
+
+    # 2) sample_weight=None should be equivalent to sample_weight = number
+    sample_weight = 123.0
+    reg.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(reg.coef_, coef, rtol=1e-6)
+    if fit_intercept:
+        assert_allclose(reg.intercept_, intercept)
+
+    # 3) scaling of sample_weight should have no effect, cf. np.average()
+    # The weighted fit below becomes the new reference for this step.
+    sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0])
+    reg = reg.fit(X, y, sample_weight=sample_weight)
+    coef = reg.coef_.copy()
+    if fit_intercept:
+        intercept = reg.intercept_
+
+    # A looser tolerance is used for sparse input.
+    reg.fit(X, y, sample_weight=np.pi * sample_weight)
+    assert_allclose(reg.coef_, coef, rtol=1e-6 if sparse_container is None else 1e-5)
+    if fit_intercept:
+        assert_allclose(reg.intercept_, intercept)
+
+    # 4) setting elements of sample_weight to 0 is equivalent to removing these samples
+    # NOTE: y is modified in place here, so step 5) also sees the scaled y.
+    sample_weight_0 = sample_weight.copy()
+    sample_weight_0[-5:] = 0
+    y[-5:] *= 1000  # to make excluding those samples important
+    reg.fit(X, y, sample_weight=sample_weight_0)
+    coef_0 = reg.coef_.copy()
+    if fit_intercept:
+        intercept_0 = reg.intercept_
+    reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5])
+    if fit_intercept and sparse_container is None:
+        # FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164
+        # This often fails, e.g. when calling
+        # SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \
+        # sklearn/linear_model/tests/test_base.py\
+        # ::test_linear_regression_sample_weight_consistency
+        pass
+    else:
+        assert_allclose(reg.coef_, coef_0, rtol=1e-5)
+        if fit_intercept:
+            assert_allclose(reg.intercept_, intercept_0)
+
+    # 5) check that multiplying sample_weight by 2 is equivalent to repeating
+    # corresponding samples twice
+    # X2/y2 append a second copy of the first half of the samples.
+    if sparse_container is not None:
+        X2 = sparse.vstack([X, X[: n_samples // 2]], format="csc")
+    else:
+        X2 = np.concatenate([X, X[: n_samples // 2]], axis=0)
+    y2 = np.concatenate([y, y[: n_samples // 2]])
+    # Variant 1: original samples, first half with doubled weight.
+    sample_weight_1 = sample_weight.copy()
+    sample_weight_1[: n_samples // 2] *= 2
+    # Variant 2: repeated samples, each with its original weight.
+    sample_weight_2 = np.concatenate(
+        [sample_weight, sample_weight[: n_samples // 2]], axis=0
+    )
+
+    reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1)
+    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2)
+    assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-6)
+    if fit_intercept:
+        assert_allclose(reg1.intercept_, reg2.intercept_)
@@ -0,0 +1,299 @@
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
+#
+# License: BSD 3 clause
+
+from math import log
+
+import numpy as np
+import pytest
+
+from sklearn import datasets
+from sklearn.linear_model import ARDRegression, BayesianRidge, Ridge
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import (
+    _convert_container,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_less,
+)
+from sklearn.utils.extmath import fast_logdet
+
+# Diabetes dataset, loaded once at import time; the tests below read its
+# ``data`` and ``target`` attributes.
+diabetes = datasets.load_diabetes()
+
+
+def test_bayesian_ridge_scores():
+    """Check scores attribute shape"""
+    X, y = diabetes.data, diabetes.target
+
+    clf = BayesianRidge(compute_score=True)
+    clf.fit(X, y)
+
+    assert clf.scores_.shape == (clf.n_iter_ + 1,)
+
+
+def test_bayesian_ridge_score_values():
+    """Check value of score on toy example.
+
+    Compute log marginal likelihood with equation (36) in Sparse Bayesian
+    Learning and the Relevance Vector Machine (Tipping, 2001):
+
+    - 0.5 * (log |Id/alpha + X.X^T/lambda| +
+             y^T.(Id/alpha + X.X^T/lambda).y + n * log(2 * pi))
+    + lambda_1 * log(lambda) - lambda_2 * lambda
+    + alpha_1 * log(alpha) - alpha_2 * alpha
+
+    and check equality with the score computed during training.
+    """
+
+    X, y = diabetes.data, diabetes.target
+    n_samples = X.shape[0]
+    # check with initial values of alpha and lambda (see code for the values)
+    eps = np.finfo(np.float64).eps
+    alpha_ = 1.0 / (np.var(y) + eps)
+    lambda_ = 1.0
+
+    # value of the parameters of the Gamma hyperpriors
+    alpha_1 = 0.1
+    alpha_2 = 0.1
+    lambda_1 = 0.1
+    lambda_2 = 0.1
+
+    # compute score using formula of docstring
+    score = lambda_1 * log(lambda_) - lambda_2 * lambda_
+    score += alpha_1 * log(alpha_) - alpha_2 * alpha_
+    M = 1.0 / alpha_ * np.eye(n_samples) + 1.0 / lambda_ * np.dot(X, X.T)
+    M_inv_dot_y = np.linalg.solve(M, y)
+    score += -0.5 * (
+        fast_logdet(M) + np.dot(y.T, M_inv_dot_y) + n_samples * log(2 * np.pi)
+    )
+
+    # compute score with BayesianRidge
+    clf = BayesianRidge(
+        alpha_1=alpha_1,
+        alpha_2=alpha_2,
+        lambda_1=lambda_1,
+        lambda_2=lambda_2,
+        max_iter=1,
+        fit_intercept=False,
+        compute_score=True,
+    )
+    clf.fit(X, y)
+
+    assert_almost_equal(clf.scores_[0], score, decimal=9)
+
+
+def test_bayesian_ridge_parameter():
+    # Test correctness of lambda_ and alpha_ parameters (GitHub issue #8224)
+    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
+    y = np.array([1, 2, 3, 2, 0, 4, 5]).T
+
+    # A Ridge regression model using an alpha value equal to the ratio of
+    # lambda_ and alpha_ from the Bayesian Ridge model must be identical
+    br_model = BayesianRidge(compute_score=True).fit(X, y)
+    rr_model = Ridge(alpha=br_model.lambda_ / br_model.alpha_).fit(X, y)
+    assert_array_almost_equal(rr_model.coef_, br_model.coef_)
+    assert_almost_equal(rr_model.intercept_, br_model.intercept_)
+
+
+def test_bayesian_sample_weights():
+    # Test correctness of the sample_weights method
+    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
+    y = np.array([1, 2, 3, 2, 0, 4, 5]).T
+    w = np.array([4, 3, 3, 1, 1, 2, 3]).T
+
+    # A Ridge regression model using an alpha value equal to the ratio of
+    # lambda_ and alpha_ from the Bayesian Ridge model must be identical
+    br_model = BayesianRidge(compute_score=True).fit(X, y, sample_weight=w)
+    rr_model = Ridge(alpha=br_model.lambda_ / br_model.alpha_).fit(
+        X, y, sample_weight=w
+    )
+    assert_array_almost_equal(rr_model.coef_, br_model.coef_)
+    assert_almost_equal(rr_model.intercept_, br_model.intercept_)
+
+
+def test_toy_bayesian_ridge_object():
+    # Test BayesianRidge on toy
+    X = np.array([[1], [2], [6], [8], [10]])
+    Y = np.array([1, 2, 6, 8, 10])
+    clf = BayesianRidge(compute_score=True)
+    clf.fit(X, Y)
+
+    # Check that the model could approximately learn the identity function
+    test = [[1], [3], [4]]
+    assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)
+
+
+def test_bayesian_initial_params():
+    # Test BayesianRidge with initial values (alpha_init, lambda_init)
+    X = np.vander(np.linspace(0, 4, 5), 4)
+    y = np.array([0.0, 1.0, 0.0, -1.0, 0.0])  # y = (x^3 - 6x^2 + 8x) / 3
+
+    # In this case, starting from the default initial values will increase
+    # the bias of the fitted curve. So, lambda_init should be small.
+    reg = BayesianRidge(alpha_init=1.0, lambda_init=1e-3)
+    # Check the R2 score nearly equals to one.
+    r2 = reg.fit(X, y).score(X, y)
+    assert_almost_equal(r2, 1.0)
+
+
+def test_prediction_bayesian_ridge_ard_with_constant_input():
+    # Test BayesianRidge and ARDRegression predictions for edge case of
+    # constant target vectors
+    n_samples = 4
+    n_features = 5
+    random_state = check_random_state(42)
+    constant_value = random_state.rand()
+    X = random_state.random_sample((n_samples, n_features))
+    y = np.full(n_samples, constant_value, dtype=np.array(constant_value).dtype)
+    expected = np.full(n_samples, constant_value, dtype=np.array(constant_value).dtype)
+
+    for clf in [BayesianRidge(), ARDRegression()]:
+        y_pred = clf.fit(X, y).predict(X)
+        assert_array_almost_equal(y_pred, expected)
+
+
+def test_std_bayesian_ridge_ard_with_constant_input():
+    # Test BayesianRidge and ARDRegression standard dev. for edge case of
+    # constant target vector
+    # The standard dev. should be relatively small (< 0.01 is tested here)
+    n_samples = 10
+    n_features = 5
+    random_state = check_random_state(42)
+    constant_value = random_state.rand()
+    X = random_state.random_sample((n_samples, n_features))
+    y = np.full(n_samples, constant_value, dtype=np.array(constant_value).dtype)
+    expected_upper_boundary = 0.01
+
+    for clf in [BayesianRidge(), ARDRegression()]:
+        _, y_std = clf.fit(X, y).predict(X, return_std=True)
+        assert_array_less(y_std, expected_upper_boundary)
+
+
+def test_update_of_sigma_in_ard():
+    # Checks that `sigma_` is updated correctly after the last iteration
+    # of the ARDRegression algorithm. See issue #10128.
+    X = np.array([[1, 0], [0, 0]])
+    y = np.array([0, 0])
+    clf = ARDRegression(max_iter=1)
+    clf.fit(X, y)
+    # With the inputs above, ARDRegression prunes both of the two coefficients
+    # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0).
+    assert clf.sigma_.shape == (0, 0)
+    # Ensure that no error is thrown at prediction stage
+    clf.predict(X, return_std=True)
+
+
+def test_toy_ard_object():
+    # Test BayesianRegression ARD classifier
+    X = np.array([[1], [2], [3]])
+    Y = np.array([1, 2, 3])
+    clf = ARDRegression(compute_score=True)
+    clf.fit(X, Y)
+
+    # Check that the model could approximately learn the identity function
+    test = [[1], [3], [4]]
+    assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)
+
+
+@pytest.mark.parametrize("n_samples, n_features", ((10, 100), (100, 10)))
+def test_ard_accuracy_on_easy_problem(global_random_seed, n_samples, n_features):
+    # Check that ARD converges with reasonable accuracy on an easy problem
+    # (Github issue #14055)
+    X = np.random.RandomState(global_random_seed).normal(size=(250, 3))
+    y = X[:, 1]
+
+    regressor = ARDRegression()
+    regressor.fit(X, y)
+
+    abs_coef_error = np.abs(1 - regressor.coef_[1])
+    assert abs_coef_error < 1e-10
+
+
+@pytest.mark.parametrize("constructor_name", ["array", "dataframe"])
+def test_return_std(constructor_name):
+    # Test return_std option for both Bayesian regressors
+    # NOTE(review): the data below is drawn from the global ``np.random``
+    # state, which is not seeded in this function.
+    def f(X):
+        # Noise-free linear target; ``w`` and ``b`` come from the enclosing
+        # scope, where they are assigned further down.
+        return np.dot(X, w) + b
+
+    def f_noise(X, noise_mult):
+        # Linear target plus standard normal noise scaled by ``noise_mult``.
+        return f(X) + np.random.randn(X.shape[0]) * noise_mult
+
+    d = 5
+    n_train = 50
+    n_test = 10
+
+    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
+    b = 1.0
+
+    X = np.random.random((n_train, d))
+    X = _convert_container(X, constructor_name)
+
+    X_test = np.random.random((n_test, d))
+    X_test = _convert_container(X_test, constructor_name)
+
+    # ``decimal`` is the position in the list (0, 1, 2), so the comparison of
+    # the predicted std with the noise multiplier tightens as the noise drops.
+    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
+        y = f_noise(X, noise_mult)
+
+        m1 = BayesianRidge()
+        m1.fit(X, y)
+        y_mean1, y_std1 = m1.predict(X_test, return_std=True)
+        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)
+
+        m2 = ARDRegression()
+        m2.fit(X, y)
+        y_mean2, y_std2 = m2.predict(X_test, return_std=True)
+        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)
+
+
+def test_update_sigma(global_random_seed):
+    # make sure the two update_sigma() helpers are equivalent. The woodbury
+    # formula is used when n_samples < n_features, and the other one is used
+    # otherwise.
+
+    rng = np.random.RandomState(global_random_seed)
+
+    # set n_samples == n_features to avoid instability issues when inverting
+    # the matrices. Using the woodbury formula would be unstable when
+    # n_samples > n_features
+    n_samples = n_features = 10
+    X = rng.randn(n_samples, n_features)
+    alpha = 1
+    lmbda = np.arange(1, n_features + 1)
+    keep_lambda = np.array([True] * n_features)
+
+    reg = ARDRegression()
+
+    sigma = reg._update_sigma(X, alpha, lmbda, keep_lambda)
+    sigma_woodbury = reg._update_sigma_woodbury(X, alpha, lmbda, keep_lambda)
+
+    np.testing.assert_allclose(sigma, sigma_woodbury)
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("Estimator", [BayesianRidge, ARDRegression])
+def test_dtype_match(dtype, Estimator):
+    # Test that np.float32 input data is not cast to np.float64 when possible
+    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]], dtype=dtype)
+    y = np.array([1, 2, 3, 2, 0, 4, 5]).T
+
+    model = Estimator()
+    # check type consistency
+    model.fit(X, y)
+    attributes = ["coef_", "sigma_"]
+    for attribute in attributes:
+        assert getattr(model, attribute).dtype == X.dtype
+
+    y_mean, y_std = model.predict(X, return_std=True)
+    assert y_mean.dtype == X.dtype
+    assert y_std.dtype == X.dtype
+
+
+@pytest.mark.parametrize("Estimator", [BayesianRidge, ARDRegression])
+def test_dtype_correctness(Estimator):
+    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
+    y = np.array([1, 2, 3, 2, 0, 4, 5]).T
+    model = Estimator()
+    coef_32 = model.fit(X.astype(np.float32), y).coef_
+    coef_64 = model.fit(X.astype(np.float64), y).coef_
+    np.testing.assert_allclose(coef_32, coef_64, rtol=1e-4)
@@ -0,0 +1,147 @@
+# License: BSD 3 clause
+
+import inspect
+
+import numpy as np
+import pytest
+
+from sklearn.base import is_classifier
+from sklearn.datasets import make_low_rank_matrix
+from sklearn.linear_model import (
+    ARDRegression,
+    BayesianRidge,
+    ElasticNet,
+    ElasticNetCV,
+    Lars,
+    LarsCV,
+    Lasso,
+    LassoCV,
+    LassoLarsCV,
+    LassoLarsIC,
+    LinearRegression,
+    LogisticRegression,
+    LogisticRegressionCV,
+    MultiTaskElasticNet,
+    MultiTaskElasticNetCV,
+    MultiTaskLasso,
+    MultiTaskLassoCV,
+    OrthogonalMatchingPursuit,
+    OrthogonalMatchingPursuitCV,
+    PoissonRegressor,
+    Ridge,
+    RidgeCV,
+    SGDRegressor,
+    TweedieRegressor,
+)
+
+
+# Note: GammaRegressor() and TweedieRegressor(power != 1) have a non-canonical link.
+@pytest.mark.parametrize(
+    "model",
+    [
+        ARDRegression(),
+        BayesianRidge(),
+        ElasticNet(),
+        ElasticNetCV(),
+        Lars(),
+        LarsCV(),
+        Lasso(),
+        LassoCV(),
+        LassoLarsCV(),
+        LassoLarsIC(),
+        LinearRegression(),
+        # TODO: Fix SAGA which fails badly with sample_weights.
+        # This is a known limitation, see:
+        # https://github.com/scikit-learn/scikit-learn/issues/21305
+        pytest.param(
+            LogisticRegression(
+                penalty="elasticnet", solver="saga", l1_ratio=0.5, tol=1e-15
+            ),
+            marks=pytest.mark.xfail(reason="Missing importance sampling scheme"),
+        ),
+        LogisticRegressionCV(tol=1e-6),
+        MultiTaskElasticNet(),
+        MultiTaskElasticNetCV(),
+        MultiTaskLasso(),
+        MultiTaskLassoCV(),
+        OrthogonalMatchingPursuit(),
+        OrthogonalMatchingPursuitCV(),
+        PoissonRegressor(),
+        Ridge(),
+        RidgeCV(),
+        pytest.param(
+            SGDRegressor(tol=1e-15),
+            marks=pytest.mark.xfail(reason="Insufficient precision."),
+        ),
+        SGDRegressor(penalty="elasticnet", max_iter=10_000),
+        TweedieRegressor(power=0),  # same as Ridge
+    ],
+    ids=lambda x: x.__class__.__name__,
+)
+@pytest.mark.parametrize("with_sample_weight", [False, True])
+def test_balance_property(model, with_sample_weight, global_random_seed):
+    # Test that sum(y_predicted) == sum(y_observed) on the training set.
+    # This must hold for all linear models with deviance of an exponential dispersion
+    # family as loss and the corresponding canonical link if fit_intercept=True.
+    # Examples:
+    #     - squared error and identity link (most linear models)
+    #     - Poisson deviance with log link
+    #     - log loss with logit link
+    # This is known as balance property or unconditional calibration/unbiasedness.
+    # For reference, see Corollary 3.18, 3.20 and Chapter 5.1.5 of
+    # M.V. Wuthrich and M. Merz, "Statistical Foundations of Actuarial Learning and its
+    # Applications" (June 3, 2022). http://doi.org/10.2139/ssrn.3822407
+
+    # Skip the weighted variant for estimators whose ``fit`` signature has no
+    # ``sample_weight`` parameter.
+    if (
+        with_sample_weight
+        and "sample_weight" not in inspect.signature(model.fit).parameters.keys()
+    ):
+        pytest.skip("Estimator does not support sample_weight.")
+
+    rel = 2e-4  # test precision
+    # Looser relative tolerance for SGDRegressor and for the "saga" solver.
+    if isinstance(model, SGDRegressor):
+        rel = 1e-1
+    elif hasattr(model, "solver") and model.solver == "saga":
+        rel = 1e-2
+
+    rng = np.random.RandomState(global_random_seed)
+    n_train, n_features, n_targets = 100, 10, None
+    # The multi-task estimators are fitted on three targets.
+    if isinstance(
+        model,
+        (MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV),
+    ):
+        n_targets = 3
+    X = make_low_rank_matrix(n_samples=n_train, n_features=n_features, random_state=rng)
+    # Random coefficients in [-2, 2), scaled by the per-column maximum of X.
+    if n_targets:
+        coef = (
+            rng.uniform(low=-2, high=2, size=(n_features, n_targets))
+            / np.max(X, axis=0)[:, None]
+        )
+    else:
+        coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
+
+    expectation = np.exp(X @ coef + 0.5)
+    y = rng.poisson(lam=expectation) + 1  # strict positive, i.e. y > 0
+    # Classifiers get a binary target derived from the Poisson draws.
+    if is_classifier(model):
+        y = (y > expectation + 1).astype(np.float64)
+
+    if with_sample_weight:
+        sw = rng.uniform(low=1, high=10, size=y.shape[0])
+    else:
+        sw = None
+
+    model.set_params(fit_intercept=True)  # to be sure
+    if with_sample_weight:
+        model.fit(X, y, sample_weight=sw)
+    else:
+        model.fit(X, y)
+
+    # Assert balance property.
+    # The (weighted) average prediction on the training set must match the
+    # (weighted) average target; classifiers use the second probability column.
+    if is_classifier(model):
+        assert np.average(model.predict_proba(X)[:, 1], weights=sw) == pytest.approx(
+            np.average(y, weights=sw), rel=rel
+        )
+    else:
+        assert np.average(model.predict(X), weights=sw, axis=0) == pytest.approx(
+            np.average(y, weights=sw, axis=0), rel=rel
+        )
@@ -0,0 +1,216 @@
+# Authors: Manoj Kumar mks542@nyu.edu
+# License: BSD 3 clause
+
+import numpy as np
+import pytest
+from scipy import optimize
+
+from sklearn.datasets import make_regression
+from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge, SGDRegressor
+from sklearn.linear_model._huber import _huber_loss_and_gradient
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+
+def make_regression_with_outliers(n_samples=50, n_features=20):
+    rng = np.random.RandomState(0)
+    # Generate data with outliers by replacing 10% of the samples with noise.
+    X, y = make_regression(
+        n_samples=n_samples, n_features=n_features, random_state=0, noise=0.05
+    )
+
+    # Replace 10% of the sample with noise.
+    num_noise = int(0.1 * n_samples)
+    random_samples = rng.randint(0, n_samples, num_noise)
+    X[random_samples, :] = 2.0 * rng.normal(0, 1, (num_noise, X.shape[1]))
+    return X, y
+
+
+def test_huber_equals_lr_for_high_epsilon():
+    # Test that Ridge matches LinearRegression for large epsilon
+    X, y = make_regression_with_outliers()
+    lr = LinearRegression()
+    lr.fit(X, y)
+    huber = HuberRegressor(epsilon=1e3, alpha=0.0)
+    huber.fit(X, y)
+    assert_almost_equal(huber.coef_, lr.coef_, 3)
+    assert_almost_equal(huber.intercept_, lr.intercept_, 2)
+
+
+def test_huber_max_iter():
+    X, y = make_regression_with_outliers()
+    huber = HuberRegressor(max_iter=1)
+    huber.fit(X, y)
+    assert huber.n_iter_ == huber.max_iter
+
+
+def test_huber_gradient():
+    # Test that the gradient calculated by _huber_loss_and_gradient is correct
+    rng = np.random.RandomState(1)
+    X, y = make_regression_with_outliers()
+    sample_weight = rng.randint(1, 3, (y.shape[0]))
+
+    def loss_func(x, *args):
+        return _huber_loss_and_gradient(x, *args)[0]
+
+    def grad_func(x, *args):
+        return _huber_loss_and_gradient(x, *args)[1]
+
+    # Check using optimize.check_grad that the gradients are equal.
+    for _ in range(5):
+        # Check for both fit_intercept and otherwise.
+        for n_features in [X.shape[1] + 1, X.shape[1] + 2]:
+            w = rng.randn(n_features)
+            w[-1] = np.abs(w[-1])
+            grad_same = optimize.check_grad(
+                loss_func, grad_func, w, X, y, 0.01, 0.1, sample_weight
+            )
+            assert_almost_equal(grad_same, 1e-6, 4)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_huber_sample_weights(csr_container):
+    # Test sample_weights implementation in HuberRegressor
+
+    # Reference fit without sample weights.
+    X, y = make_regression_with_outliers()
+    huber = HuberRegressor()
+    huber.fit(X, y)
+    huber_coef = huber.coef_
+    huber_intercept = huber.intercept_
+
+    # Rescale coefs before comparing with assert_array_almost_equal to make
+    # sure that the number of decimal places used is somewhat insensitive to
+    # the amplitude of the coefficients and therefore to the scale of the
+    # data and the regularization parameter
+    scale = max(np.mean(np.abs(huber.coef_)), np.mean(np.abs(huber.intercept_)))
+
+    # Unit sample weights must give the same fit as no weights at all.
+    huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
+    assert_array_almost_equal(huber.coef_ / scale, huber_coef / scale)
+    assert_array_almost_equal(huber.intercept_ / scale, huber_intercept / scale)
+
+    # Repeating sample 1 twice more and sample 3 once more must be equivalent
+    # to giving them weights 3 and 2 respectively.
+    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
+    X_new = np.vstack((X, np.vstack((X[1], X[1], X[3]))))
+    y_new = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
+    huber.fit(X_new, y_new)
+    huber_coef = huber.coef_
+    huber_intercept = huber.intercept_
+    sample_weight = np.ones(X.shape[0])
+    sample_weight[1] = 3
+    sample_weight[3] = 2
+    huber.fit(X, y, sample_weight=sample_weight)
+
+    # NOTE(review): `scale` still comes from the first, larger dataset.
+    assert_array_almost_equal(huber.coef_ / scale, huber_coef / scale)
+    assert_array_almost_equal(huber.intercept_ / scale, huber_intercept / scale)
+
+    # Test sparse implementation with sample weights.
+    X_csr = csr_container(X)
+    huber_sparse = HuberRegressor()
+    huber_sparse.fit(X_csr, y, sample_weight=sample_weight)
+    assert_array_almost_equal(huber_sparse.coef_ / scale, huber_coef / scale)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_huber_sparse(csr_container):
+    X, y = make_regression_with_outliers()
+    huber = HuberRegressor(alpha=0.1)
+    huber.fit(X, y)
+
+    X_csr = csr_container(X)
+    huber_sparse = HuberRegressor(alpha=0.1)
+    huber_sparse.fit(X_csr, y)
+    assert_array_almost_equal(huber_sparse.coef_, huber.coef_)
+    assert_array_equal(huber.outliers_, huber_sparse.outliers_)
+
+
+def test_huber_scaling_invariant():
+    # Test that outliers filtering is scaling independent.
+    X, y = make_regression_with_outliers()
+    huber = HuberRegressor(fit_intercept=False, alpha=0.0)
+    huber.fit(X, y)
+    n_outliers_mask_1 = huber.outliers_
+    assert not np.all(n_outliers_mask_1)
+
+    huber.fit(X, 2.0 * y)
+    n_outliers_mask_2 = huber.outliers_
+    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)
+
+    huber.fit(2.0 * X, 2.0 * y)
+    n_outliers_mask_3 = huber.outliers_
+    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
+
+
+def test_huber_and_sgd_same_results():
+    # Test they should converge to same coefficients for same parameters
+
+    X, y = make_regression_with_outliers(n_samples=10, n_features=2)
+
+    # Fit once to find out the scale parameter. Scale down X and y by scale
+    # so that the scale parameter is optimized to 1.0
+    huber = HuberRegressor(fit_intercept=False, alpha=0.0, epsilon=1.35)
+    huber.fit(X, y)
+    X_scale = X / huber.scale_
+    y_scale = y / huber.scale_
+    huber.fit(X_scale, y_scale)
+    assert_almost_equal(huber.scale_, 1.0, 3)
+
+    sgdreg = SGDRegressor(
+        alpha=0.0,
+        loss="huber",
+        shuffle=True,
+        random_state=0,
+        max_iter=10000,
+        fit_intercept=False,
+        epsilon=1.35,
+        tol=None,
+    )
+    sgdreg.fit(X_scale, y_scale)
+    assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
+
+
+def test_huber_warm_start():
+    X, y = make_regression_with_outliers()
+    huber_warm = HuberRegressor(alpha=1.0, max_iter=10000, warm_start=True, tol=1e-1)
+
+    huber_warm.fit(X, y)
+    huber_warm_coef = huber_warm.coef_.copy()
+    huber_warm.fit(X, y)
+
+    # SciPy performs the tol check after doing the coef updates, so
+    # these would be almost same but not equal.
+    assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)
+
+    assert huber_warm.n_iter_ == 0
+
+
+def test_huber_better_r2_score():
+    # Test that huber returns a better r2 score than non-outliers"""
+    X, y = make_regression_with_outliers()
+    huber = HuberRegressor(alpha=0.01)
+    huber.fit(X, y)
+    linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
+    mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
+    huber_score = huber.score(X[mask], y[mask])
+    huber_outlier_score = huber.score(X[~mask], y[~mask])
+
+    # The Ridge regressor should be influenced by the outliers and hence
+    # give a worse score on the non-outliers as compared to the huber
+    # regressor.
+    ridge = Ridge(alpha=0.01)
+    ridge.fit(X, y)
+    ridge_score = ridge.score(X[mask], y[mask])
+    ridge_outlier_score = ridge.score(X[~mask], y[~mask])
+    assert huber_score > ridge_score
+
+    # The huber model should also fit poorly on the outliers.
+    assert ridge_outlier_score > huber_outlier_score
+
+
+def test_huber_bool():
+    # Test that it does not crash with bool data
+    X, y = make_regression(n_samples=200, n_features=2, noise=4.0, random_state=0)
+    X_bool = X > 0
+    HuberRegressor().fit(X_bool, y)
@@ -0,0 +1,870 @@
+import warnings
+
+import numpy as np
+import pytest
+from scipy import linalg
+
+from sklearn import datasets, linear_model
+from sklearn.base import clone
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.linear_model import (
+    Lars,
+    LarsCV,
+    LassoLars,
+    LassoLarsCV,
+    LassoLarsIC,
+    lars_path,
+)
+from sklearn.linear_model._least_angle import _lars_path_residues
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.utils._testing import (
+    TempMemmap,
+    assert_allclose,
+    assert_array_almost_equal,
+    ignore_warnings,
+)
+
+# TODO: use another dataset that has multiple drops
+# Module-level data shared by the tests below: the diabetes features and
+# target, the Gram matrix X^T X, the product X^T y and the number of samples.
+diabetes = datasets.load_diabetes()
+X, y = diabetes.data, diabetes.target
+G = np.dot(X.T, X)
+Xy = np.dot(X.T, y)
+n_samples = y.size
+
+
+def test_simple():
+    # Principle of Lars is to keep covariances tied and decreasing
+
+    # also test verbose output
+    import sys
+    from io import StringIO
+
+    old_stdout = sys.stdout
+    try:
+        sys.stdout = StringIO()
+
+        _, _, coef_path_ = linear_model.lars_path(X, y, method="lar", verbose=10)
+
+        sys.stdout = old_stdout
+
+        for i, coef_ in enumerate(coef_path_.T):
+            res = y - np.dot(X, coef_)
+            cov = np.dot(X.T, res)
+            C = np.max(abs(cov))
+            eps = 1e-3
+            ocur = len(cov[C - eps < abs(cov)])
+            if i < X.shape[1]:
+                assert ocur == i + 1
+            else:
+                # no more than max_pred variables can go into the active set
+                assert ocur == X.shape[1]
+    finally:
+        sys.stdout = old_stdout
+
+
+def test_simple_precomputed():
+    # The same, with precomputed Gram matrix
+
+    _, _, coef_path_ = linear_model.lars_path(X, y, Gram=G, method="lar")
+
+    for i, coef_ in enumerate(coef_path_.T):
+        res = y - np.dot(X, coef_)
+        cov = np.dot(X.T, res)
+        C = np.max(abs(cov))
+        eps = 1e-3
+        ocur = len(cov[C - eps < abs(cov)])
+        if i < X.shape[1]:
+            assert ocur == i + 1
+        else:
+            # no more than max_pred variables can go into the active set
+            assert ocur == X.shape[1]
+
+
+def _assert_same_lars_path_result(output1, output2):
+    assert len(output1) == len(output2)
+    for o1, o2 in zip(output1, output2):
+        assert_allclose(o1, o2)
+
+
+@pytest.mark.parametrize("method", ["lar", "lasso"])
+@pytest.mark.parametrize("return_path", [True, False])
+def test_lars_path_gram_equivalent(method, return_path):
+    _assert_same_lars_path_result(
+        linear_model.lars_path_gram(
+            Xy=Xy, Gram=G, n_samples=n_samples, method=method, return_path=return_path
+        ),
+        linear_model.lars_path(X, y, Gram=G, method=method, return_path=return_path),
+    )
+
+
+def test_x_none_gram_none_raises_value_error():
+    # Test that lars_path with no X and Gram raises exception
+    Xy = np.dot(X.T, y)
+    with pytest.raises(ValueError, match="X and Gram cannot both be unspecified"):
+        linear_model.lars_path(None, y, Gram=None, Xy=Xy)
+
+
+def test_all_precomputed():
+    # Test that lars_path with precomputed Gram and Xy gives the right answer
+    G = np.dot(X.T, X)
+    Xy = np.dot(X.T, y)
+    for method in "lar", "lasso":
+        output = linear_model.lars_path(X, y, method=method)
+        output_pre = linear_model.lars_path(X, y, Gram=G, Xy=Xy, method=method)
+        for expected, got in zip(output, output_pre):
+            assert_array_almost_equal(expected, got)
+
+
+@pytest.mark.filterwarnings("ignore: `rcond` parameter will change")
+# numpy deprecation
+def test_lars_lstsq():
+    # Test that Lars gives least square solution at the end
+    # of the path
+    X1 = 3 * X  # use un-normalized dataset
+    clf = linear_model.LassoLars(alpha=0.0)
+    clf.fit(X1, y)
+    coef_lstsq = np.linalg.lstsq(X1, y, rcond=None)[0]
+    assert_array_almost_equal(clf.coef_, coef_lstsq)
+
+
+@pytest.mark.filterwarnings("ignore:`rcond` parameter will change")
+# numpy deprecation
+def test_lasso_gives_lstsq_solution():
+    # Test that Lars Lasso gives least square solution at the end
+    # of the path
+    _, _, coef_path_ = linear_model.lars_path(X, y, method="lasso")
+    coef_lstsq = np.linalg.lstsq(X, y)[0]
+    assert_array_almost_equal(coef_lstsq, coef_path_[:, -1])
+
+
+def test_collinearity():
+    # Check that lars_path is robust to collinearity in input
+    X = np.array([[3.0, 3.0, 1.0], [2.0, 2.0, 0.0], [1.0, 1.0, 0]])
+    y = np.array([1.0, 0.0, 0])
+    rng = np.random.RandomState(0)
+
+    f = ignore_warnings
+    _, _, coef_path_ = f(linear_model.lars_path)(X, y, alpha_min=0.01)
+    assert not np.isnan(coef_path_).any()
+    residual = np.dot(X, coef_path_[:, -1]) - y
+    assert (residual**2).sum() < 1.0  # just make sure it's bounded
+
+    n_samples = 10
+    X = rng.rand(n_samples, 5)
+    y = np.zeros(n_samples)
+    _, _, coef_path_ = linear_model.lars_path(
+        X,
+        y,
+        Gram="auto",
+        copy_X=False,
+        copy_Gram=False,
+        alpha_min=0.0,
+        method="lasso",
+        verbose=0,
+        max_iter=500,
+    )
+    assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_))
+
+
+def test_no_path():
+    # Test that the ``return_path=False`` option returns the correct output
+    alphas_, _, coef_path_ = linear_model.lars_path(X, y, method="lar")
+    alpha_, _, coef = linear_model.lars_path(X, y, method="lar", return_path=False)
+
+    assert_array_almost_equal(coef, coef_path_[:, -1])
+    assert alpha_ == alphas_[-1]
+
+
+def test_no_path_precomputed():
+    # Test that the ``return_path=False`` option with Gram remains correct
+    alphas_, _, coef_path_ = linear_model.lars_path(X, y, method="lar", Gram=G)
+    alpha_, _, coef = linear_model.lars_path(
+        X, y, method="lar", Gram=G, return_path=False
+    )
+
+    assert_array_almost_equal(coef, coef_path_[:, -1])
+    assert alpha_ == alphas_[-1]
+
+
+def test_no_path_all_precomputed():
+    # Test that the ``return_path=False`` option with Gram and Xy remains
+    # correct
+    X, y = 3 * diabetes.data, diabetes.target
+    G = np.dot(X.T, X)
+    Xy = np.dot(X.T, y)
+    alphas_, _, coef_path_ = linear_model.lars_path(
+        X, y, method="lasso", Xy=Xy, Gram=G, alpha_min=0.9
+    )
+    alpha_, _, coef = linear_model.lars_path(
+        X, y, method="lasso", Gram=G, Xy=Xy, alpha_min=0.9, return_path=False
+    )
+
+    assert_array_almost_equal(coef, coef_path_[:, -1])
+    assert alpha_ == alphas_[-1]
+
+
+@pytest.mark.parametrize(
+    "classifier", [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC]
+)
+def test_lars_precompute(classifier):
+    # Check for different values of precompute
+    G = np.dot(X.T, X)
+
+    clf = classifier(precompute=G)
+    output_1 = ignore_warnings(clf.fit)(X, y).coef_
+    for precompute in [True, False, "auto", None]:
+        clf = classifier(precompute=precompute)
+        output_2 = clf.fit(X, y).coef_
+        assert_array_almost_equal(output_1, output_2, decimal=8)
+
+
+def test_singular_matrix():
+    # Test when input is a singular matrix
+    X1 = np.array([[1, 1.0], [1.0, 1.0]])
+    y1 = np.array([1, 1])
+    _, _, coef_path = linear_model.lars_path(X1, y1)
+    assert_array_almost_equal(coef_path.T, [[0, 0], [1, 0]])
+
+
+def test_rank_deficient_design():
+    # consistency test that checks that LARS Lasso is handling rank
+    # deficient input data (with n_features < rank) in the same way
+    # as coordinate descent Lasso
+    y = [5, 0, 5]
+    for X in ([[5, 0], [0, 5], [10, 10]], [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]]):
+        # To be able to use the coefs to compute the objective function,
+        # we need to turn off normalization
+        lars = linear_model.LassoLars(0.1)
+        coef_lars_ = lars.fit(X, y).coef_
+        obj_lars = 1.0 / (2.0 * 3.0) * linalg.norm(
+            y - np.dot(X, coef_lars_)
+        ) ** 2 + 0.1 * linalg.norm(coef_lars_, 1)
+        coord_descent = linear_model.Lasso(0.1, tol=1e-6)
+        coef_cd_ = coord_descent.fit(X, y).coef_
+        obj_cd = (1.0 / (2.0 * 3.0)) * linalg.norm(
+            y - np.dot(X, coef_cd_)
+        ) ** 2 + 0.1 * linalg.norm(coef_cd_, 1)
+        assert obj_lars < obj_cd * (1.0 + 1e-8)
+
+
+def test_lasso_lars_vs_lasso_cd():
+    # Test that LassoLars and Lasso using coordinate descent give the
+    # same results.
+    # Local X shadows the module-level one; y is the module-level target.
+    X = 3 * diabetes.data
+
+    alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso")
+    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
+    # Refit the same coordinate descent estimator at every alpha of the path
+    # and compare its coefficients with the corresponding path column.
+    for c, a in zip(lasso_path.T, alphas):
+        if a == 0:
+            continue
+        lasso_cd.alpha = a
+        lasso_cd.fit(X, y)
+        error = linalg.norm(c - lasso_cd.coef_)
+        assert error < 0.01
+
+    # similar test, with the classifiers
+    for alpha in np.linspace(1e-2, 1 - 1e-2, 20):
+        clf1 = linear_model.LassoLars(alpha=alpha).fit(X, y)
+        clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8).fit(X, y)
+        err = linalg.norm(clf1.coef_ - clf2.coef_)
+        assert err < 1e-3
+
+    # same test, with normalized data
+    # NOTE(review): this subtracts the column sums, not the column means;
+    # confirm that this is the intended centering.
+    X = diabetes.data
+    X = X - X.sum(axis=0)
+    X /= np.linalg.norm(X, axis=0)
+    alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso")
+    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
+    for c, a in zip(lasso_path.T, alphas):
+        if a == 0:
+            continue
+        lasso_cd.alpha = a
+        lasso_cd.fit(X, y)
+        error = linalg.norm(c - lasso_cd.coef_)
+        assert error < 0.01
+
+
+def test_lasso_lars_vs_lasso_cd_early_stopping():
+    # Test that LassoLars and Lasso using coordinate descent give the
+    # same results when early stopping is used.
+    # (test : before, in the middle, and in the last part of the path)
+    alphas_min = [10, 0.9, 1e-4]
+
+    X = diabetes.data
+
+    for alpha_min in alphas_min:
+        alphas, _, lasso_path = linear_model.lars_path(
+            X, y, method="lasso", alpha_min=alpha_min
+        )
+        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
+        lasso_cd.alpha = alphas[-1]
+        lasso_cd.fit(X, y)
+        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
+        assert error < 0.01
+
+    # same test, with normalization
+    X = diabetes.data - diabetes.data.sum(axis=0)
+    X /= np.linalg.norm(X, axis=0)
+
+    for alpha_min in alphas_min:
+        alphas, _, lasso_path = linear_model.lars_path(
+            X, y, method="lasso", alpha_min=alpha_min
+        )
+        lasso_cd = linear_model.Lasso(tol=1e-8)
+        lasso_cd.alpha = alphas[-1]
+        lasso_cd.fit(X, y)
+        error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_)
+        assert error < 0.01
+
+
+def test_lasso_lars_path_length():
+    # Test that the path length of the LassoLars is right
+    lasso = linear_model.LassoLars()
+    lasso.fit(X, y)
+    lasso2 = linear_model.LassoLars(alpha=lasso.alphas_[2])
+    lasso2.fit(X, y)
+    assert_array_almost_equal(lasso.alphas_[:3], lasso2.alphas_)
+    # Also check that the sequence of alphas is always decreasing
+    assert np.all(np.diff(lasso.alphas_) < 0)
+
+
+def test_lasso_lars_vs_lasso_cd_ill_conditioned():
+    # Test lasso lars on a very ill-conditioned design, and check that
+    # it does not blow up, and stays somewhat close to a solution given
+    # by the coordinate descent solver
+    # Also test that lasso_path (using lars_path output style) gives
+    # the same result as lars_path and previous lasso output style
+    # under these conditions.
+    rng = np.random.RandomState(42)
+
+    # Generate data
+    n, m = 70, 100
+    k = 5
+    X = rng.randn(n, m)
+    w = np.zeros((m, 1))
+    i = np.arange(0, m)
+    rng.shuffle(i)
+    supp = i[:k]
+    w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1)
+    y = np.dot(X, w)
+    sigma = 0.2
+    y += sigma * rng.rand(*y.shape)
+    y = y.squeeze()
+    lars_alphas, _, lars_coef = linear_model.lars_path(X, y, method="lasso")
+
+    _, lasso_coef2, _ = linear_model.lasso_path(X, y, alphas=lars_alphas, tol=1e-6)
+
+    assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1)
+
+
+def test_lasso_lars_vs_lasso_cd_ill_conditioned2():
+    # Create an ill-conditioned situation in which the LARS has to go
+    # far in the path to converge, and check that LARS and coordinate
+    # descent give the same answers
+    # Note it used to be the case that Lars had to use the drop for good
+    # strategy for this but this is no longer the case with the
+    # equality_tolerance checks
+    X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]]
+    y = [10, 10, 1]
+    alpha = 0.0001
+
+    def objective_function(coef):
+        return 1.0 / (2.0 * len(X)) * linalg.norm(
+            y - np.dot(X, coef)
+        ) ** 2 + alpha * linalg.norm(coef, 1)
+
+    lars = linear_model.LassoLars(alpha=alpha)
+    warning_message = "Regressors in active set degenerate."
+    with pytest.warns(ConvergenceWarning, match=warning_message):
+        lars.fit(X, y)
+    lars_coef_ = lars.coef_
+    lars_obj = objective_function(lars_coef_)
+
+    coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4)
+    cd_coef_ = coord_descent.fit(X, y).coef_
+    cd_obj = objective_function(cd_coef_)
+
+    assert lars_obj < cd_obj * (1.0 + 1e-8)
+
+
+def test_lars_add_features():
+    # assure that at least some features get added if necessary
+    # test for 6d2b4c
+    # Hilbert matrix
+    n = 5
+    H = 1.0 / (np.arange(1, n + 1) + np.arange(n)[:, np.newaxis])
+    clf = linear_model.Lars(fit_intercept=False).fit(H, np.arange(n))
+    assert np.all(np.isfinite(clf.coef_))
+
+
+def test_lars_n_nonzero_coefs(verbose=False):
+    lars = linear_model.Lars(n_nonzero_coefs=6, verbose=verbose)
+    lars.fit(X, y)
+    assert len(lars.coef_.nonzero()[0]) == 6
+    # The path should be of length 6 + 1 in a Lars going down to 6
+    # non-zero coefs
+    assert len(lars.alphas_) == 7
+
+
+@ignore_warnings
+def test_multitarget():
+    # Assure that estimators receiving multidimensional y do the right thing
+    Y = np.vstack([y, y**2]).T
+    n_targets = Y.shape[1]
+    estimators = [
+        linear_model.LassoLars(),
+        linear_model.Lars(),
+        # regression test for gh-1615
+        linear_model.LassoLars(fit_intercept=False),
+        linear_model.Lars(fit_intercept=False),
+    ]
+
+    for estimator in estimators:
+        estimator.fit(X, Y)
+        Y_pred = estimator.predict(X)
+        alphas, active, coef, path = (
+            estimator.alphas_,
+            estimator.active_,
+            estimator.coef_,
+            estimator.coef_path_,
+        )
+        for k in range(n_targets):
+            estimator.fit(X, Y[:, k])
+            y_pred = estimator.predict(X)
+            assert_array_almost_equal(alphas[k], estimator.alphas_)
+            assert_array_almost_equal(active[k], estimator.active_)
+            assert_array_almost_equal(coef[k], estimator.coef_)
+            assert_array_almost_equal(path[k], estimator.coef_path_)
+            assert_array_almost_equal(Y_pred[:, k], y_pred)
+
+
+def test_lars_cv():
+    # Test the LassoLarsCV object by checking that the optimal alpha
+    # increases as the number of samples increases.
+    # This property is not actually guaranteed in general and is just a
+    # property of the given dataset, with the given steps chosen.
+    old_alpha = 0
+    lars_cv = linear_model.LassoLarsCV()
+    for length in (400, 200, 100):
+        X = diabetes.data[:length]
+        y = diabetes.target[:length]
+        lars_cv.fit(X, y)
+        np.testing.assert_array_less(old_alpha, lars_cv.alpha_)
+        old_alpha = lars_cv.alpha_
+    assert not hasattr(lars_cv, "n_nonzero_coefs")
+
+
+def test_lars_cv_max_iter(recwarn):
+    warnings.simplefilter("always")
+    with np.errstate(divide="raise", invalid="raise"):
+        X = diabetes.data
+        y = diabetes.target
+        rng = np.random.RandomState(42)
+        x = rng.randn(len(y))
+        X = diabetes.data
+        X = np.c_[X, x, x]  # add correlated features
+        X = StandardScaler().fit_transform(X)
+        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
+        lars_cv.fit(X, y)
+
+    # Check that there is no warning in general and no ConvergenceWarning
+    # in particular.
+    # Materialize the string representation of the warning to get a more
+    # informative error message in case of AssertionError.
+    recorded_warnings = [str(w) for w in recwarn]
+    assert len(recorded_warnings) == 0
+
+
+def test_lasso_lars_ic():
+    # Test the LassoLarsIC object by checking that
+    # - some good features are selected.
+    # - alpha_bic > alpha_aic
+    # - n_nonzero_bic < n_nonzero_aic
+    lars_bic = linear_model.LassoLarsIC("bic")
+    lars_aic = linear_model.LassoLarsIC("aic")
+    rng = np.random.RandomState(42)
+    X = diabetes.data
+    X = np.c_[X, rng.randn(X.shape[0], 5)]  # add 5 bad features
+    X = StandardScaler().fit_transform(X)
+    lars_bic.fit(X, y)
+    lars_aic.fit(X, y)
+    nonzero_bic = np.where(lars_bic.coef_)[0]
+    nonzero_aic = np.where(lars_aic.coef_)[0]
+    assert lars_bic.alpha_ > lars_aic.alpha_
+    assert len(nonzero_bic) < len(nonzero_aic)
+    assert np.max(nonzero_bic) < diabetes.data.shape[1]
+
+
+def test_lars_path_readonly_data():
+    # When using automated memory mapping on large input, the
+    # fold data is in read-only mode
+    # This is a non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/4597
+    splitted_data = train_test_split(X, y, random_state=42)
+    with TempMemmap(splitted_data) as (X_train, X_test, y_train, y_test):
+        # The following should not fail despite copy=False
+        _lars_path_residues(X_train, y_train, X_test, y_test, copy=False)
+
+
+def test_lars_path_positive_constraint():
+    # this is the main test for the positive parameter on the lars_path method
+    # the estimator classes just make use of this function
+
+    # we do the test on the diabetes dataset
+
+    # ensure that we get negative coefficients when positive=False
+    # and all positive when positive=True
+    # for method 'lar' (default) and lasso
+
+    err_msg = "Positive constraint not supported for 'lar' coding method."
+    with pytest.raises(ValueError, match=err_msg):
+        linear_model.lars_path(
+            diabetes["data"], diabetes["target"], method="lar", positive=True
+        )
+
+    method = "lasso"
+    _, _, coefs = linear_model.lars_path(
+        X, y, return_path=True, method=method, positive=False
+    )
+    assert coefs.min() < 0
+
+    _, _, coefs = linear_model.lars_path(
+        X, y, return_path=True, method=method, positive=True
+    )
+    assert coefs.min() >= 0
+
+
+# Next, the `positive` option is tested for all estimator classes.
+
+# Keyword arguments shared by every estimator under test.
+default_parameter = {"fit_intercept": False}
+
+# Extra keyword arguments per estimator, keyed by class name in `linear_model`.
+estimator_parameter_map = {
+    "LassoLars": {"alpha": 0.1},
+    "LassoLarsCV": {},
+    "LassoLarsIC": {},
+}
+
+
+def test_estimatorclasses_positive_constraint():
+    # testing the transmissibility for the positive option of all estimator
+    # classes in this same function here
+    default_parameter = {"fit_intercept": False}
+
+    estimator_parameter_map = {
+        "LassoLars": {"alpha": 0.1},
+        "LassoLarsCV": {},
+        "LassoLarsIC": {},
+    }
+    for estname in estimator_parameter_map:
+        params = default_parameter.copy()
+        params.update(estimator_parameter_map[estname])
+        estimator = getattr(linear_model, estname)(positive=False, **params)
+        estimator.fit(X, y)
+        assert estimator.coef_.min() < 0
+        estimator = getattr(linear_model, estname)(positive=True, **params)
+        estimator.fit(X, y)
+        assert min(estimator.coef_) >= 0
+
+
+def test_lasso_lars_vs_lasso_cd_positive():
+    # Test that LassoLars and Lasso using coordinate descent give the
+    # same results when using the positive option
+
+    # This test is basically a copy of the above with additional positive
+    # option. However for the middle part, the comparison of coefficient values
+    # for a range of alphas, we had to make an adaptations. See below.
+
+    # not normalized data
+    X = 3 * diabetes.data
+
+    alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso", positive=True)
+    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
+    for c, a in zip(lasso_path.T, alphas):
+        if a == 0:
+            continue
+        lasso_cd.alpha = a
+        lasso_cd.fit(X, y)
+        error = linalg.norm(c - lasso_cd.coef_)
+        assert error < 0.01
+
+    # The range of alphas chosen for coefficient comparison here is restricted
+    # as compared with the above test without the positive option. This is due
+    # to the circumstance that the Lars-Lasso algorithm does not converge to
+    # the least-squares-solution for small alphas, see 'Least Angle Regression'
+    # by Efron et al 2004. The coefficients are typically in congruence up to
+    # the smallest alpha reached by the Lars-Lasso algorithm and start to
+    # diverge thereafter.  See
+    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff
+
+    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
+        clf1 = linear_model.LassoLars(
+            fit_intercept=False, alpha=alpha, positive=True
+        ).fit(X, y)
+        clf2 = linear_model.Lasso(
+            fit_intercept=False, alpha=alpha, tol=1e-8, positive=True
+        ).fit(X, y)
+        err = linalg.norm(clf1.coef_ - clf2.coef_)
+        assert err < 1e-3
+
+    # normalized data
+    X = diabetes.data - diabetes.data.sum(axis=0)
+    X /= np.linalg.norm(X, axis=0)
+    alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso", positive=True)
+    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
+    for c, a in zip(lasso_path.T[:-1], alphas[:-1]):  # don't include alpha=0
+        lasso_cd.alpha = a
+        lasso_cd.fit(X, y)
+        error = linalg.norm(c - lasso_cd.coef_)
+        assert error < 0.01
+
+
+def test_lasso_lars_vs_R_implementation():
+    """Compare the LassoLars coefficient path with a reference computed in R.
+
+    The reference path `r` was produced by the R `lars` library with
+    intercept=FALSE and normalize=FALSE; the scikit-learn path obtained with
+    fit_intercept=False and alpha=0 must match it to 12 decimals.
+    """
+    # Test that sklearn LassoLars implementation agrees with the LassoLars
+    # implementation available in R (lars library) when fit_intercept=False.
+
+    # Let's generate the data used in the bug report 7778
+    y = np.array([-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366])
+    x = np.array(
+        [
+            [0.47299829, 0, 0, 0, 0],
+            [0.08239882, 0.85784863, 0, 0, 0],
+            [0.30114139, -0.07501577, 0.80895216, 0, 0],
+            [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0],
+            [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291],
+        ]
+    )
+
+    # The design matrix passed to the estimator is the transpose of `x`.
+    X = x.T
+
+    # The R result was obtained using the following code:
+    #
+    # library(lars)
+    # model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE,
+    #                         trace=TRUE, normalize=FALSE)
+    # r = t(model_lasso_lars$beta)
+    #
+
+    # Reference path: one row per feature, one column per step of the path.
+    r = np.array(
+        [
+            [
+                0,
+                0,
+                0,
+                0,
+                0,
+                -79.810362809499026,
+                -83.528788732782829,
+                -83.777653739190711,
+                -83.784156932888934,
+                -84.033390591756657,
+            ],
+            [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936],
+            [
+                0,
+                -3.577397088285891,
+                -4.702795355871871,
+                -7.016748621359461,
+                -7.614898471899412,
+                -0.336938391359179,
+                0,
+                0,
+                0.001213370600853,
+                0.048162321585148,
+            ],
+            [
+                0,
+                0,
+                0,
+                2.231558436628169,
+                2.723267514525966,
+                2.811549786389614,
+                2.813766976061531,
+                2.817462468949557,
+                2.817368178703816,
+                2.816221090636795,
+            ],
+            [
+                0,
+                0,
+                -1.218422599914637,
+                -3.457726183014808,
+                -4.021304522060710,
+                -45.827461592423745,
+                -47.776608869312305,
+                -47.911561610746404,
+                -47.914845922736234,
+                -48.039562334265717,
+            ],
+        ]
+    )
+
+    model_lasso_lars = linear_model.LassoLars(alpha=0, fit_intercept=False)
+    model_lasso_lars.fit(X, y)
+    skl_betas = model_lasso_lars.coef_path_
+
+    assert_array_almost_equal(r, skl_betas, decimal=12)
+
+
+@pytest.mark.parametrize("copy_X", [True, False])
+def test_lasso_lars_copyX_behaviour(copy_X):
+    """
+    Test that user input regarding copy_X is not being overridden (it was until
+    at least version 0.21)
+
+    """
+    lasso_lars = LassoLarsIC(copy_X=copy_X, precompute=False)
+    rng = np.random.RandomState(0)
+    X = rng.normal(0, 1, (100, 5))
+    X_copy = X.copy()
+    y = X[:, 2]
+    lasso_lars.fit(X, y)
+    assert copy_X == np.array_equal(X, X_copy)
+
+
+@pytest.mark.parametrize("copy_X", [True, False])
+def test_lasso_lars_fit_copyX_behaviour(copy_X):
+    """
+    Test that user input to .fit for copy_X overrides default __init__ value
+
+    """
+    lasso_lars = LassoLarsIC(precompute=False)
+    rng = np.random.RandomState(0)
+    X = rng.normal(0, 1, (100, 5))
+    X_copy = X.copy()
+    y = X[:, 2]
+    lasso_lars.fit(X, y, copy_X=copy_X)
+    assert copy_X == np.array_equal(X, X_copy)
+
+
+@pytest.mark.parametrize("est", (LassoLars(alpha=1e-3), Lars()))
+def test_lars_with_jitter(est):
+    # Test that a small amount of jitter helps stability,
+    # using example provided in issue #2746
+
+    X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]])
+    y = [-2.5, -2.5]
+    expected_coef = [0, 2.5, 0, 2.5, 0]
+
+    # set to fit_intercept to False since target is constant and we want check
+    # the value of coef. coef would be all zeros otherwise.
+    est.set_params(fit_intercept=False)
+    est_jitter = clone(est).set_params(jitter=10e-8, random_state=0)
+
+    est.fit(X, y)
+    est_jitter.fit(X, y)
+
+    assert np.mean((est.coef_ - est_jitter.coef_) ** 2) > 0.1
+    np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3)
+
+
+def test_X_none_gram_not_none():
+    with pytest.raises(ValueError, match="X cannot be None if Gram is not None"):
+        lars_path(X=None, y=np.array([1]), Gram=True)
+
+
+def test_copy_X_with_auto_gram():
+    # Non-regression test for #17789, `copy_X=True` and Gram='auto' does not
+    # overwrite X
+    rng = np.random.RandomState(42)
+    X = rng.rand(6, 6)
+    y = rng.rand(6)
+
+    X_before = X.copy()
+    linear_model.lars_path(X, y, Gram="auto", copy_X=True, method="lasso")
+    # X did not change
+    assert_allclose(X, X_before)
+
+
+@pytest.mark.parametrize(
+    "LARS, has_coef_path, args",
+    (
+        (Lars, True, {}),
+        (LassoLars, True, {}),
+        (LassoLarsIC, False, {}),
+        (LarsCV, True, {}),
+        # max_iter=5 is for avoiding ConvergenceWarning
+        (LassoLarsCV, True, {"max_iter": 5}),
+    ),
+)
+@pytest.mark.parametrize("dtype", (np.float32, np.float64))
+def test_lars_dtype_match(LARS, has_coef_path, args, dtype):
+    # The test ensures that the fit method preserves input dtype
+    rng = np.random.RandomState(0)
+    X = rng.rand(20, 6).astype(dtype)
+    y = rng.rand(20).astype(dtype)
+
+    model = LARS(**args)
+    model.fit(X, y)
+    assert model.coef_.dtype == dtype
+    if has_coef_path:
+        assert model.coef_path_.dtype == dtype
+    assert model.intercept_.dtype == dtype
+
+
+@pytest.mark.parametrize(
+    "LARS, has_coef_path, args",
+    (
+        (Lars, True, {}),
+        (LassoLars, True, {}),
+        (LassoLarsIC, False, {}),
+        (LarsCV, True, {}),
+        # max_iter=5 is for avoiding ConvergenceWarning
+        (LassoLarsCV, True, {"max_iter": 5}),
+    ),
+)
+def test_lars_numeric_consistency(LARS, has_coef_path, args):
+    # The test ensures numerical consistency between trained coefficients
+    # of float32 and float64.
+    rtol = 1e-5
+    atol = 1e-5
+
+    rng = np.random.RandomState(0)
+    X_64 = rng.rand(10, 6)
+    y_64 = rng.rand(10)
+
+    model_64 = LARS(**args).fit(X_64, y_64)
+    model_32 = LARS(**args).fit(X_64.astype(np.float32), y_64.astype(np.float32))
+
+    assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol)
+    if has_coef_path:
+        assert_allclose(model_64.coef_path_, model_32.coef_path_, rtol=rtol, atol=atol)
+    assert_allclose(model_64.intercept_, model_32.intercept_, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("criterion", ["aic", "bic"])
+def test_lassolarsic_alpha_selection(criterion):
+    """Check that we properly compute the AIC and BIC score.
+
+    In this test, we reproduce the example of the Fig. 2 of Zou et al.
+    (reference [1] in LassoLarsIC) In this example, only 7 features should be
+    selected.
+    """
+    model = make_pipeline(StandardScaler(), LassoLarsIC(criterion=criterion))
+    model.fit(X, y)
+
+    best_alpha_selected = np.argmin(model[-1].criterion_)
+    assert best_alpha_selected == 7
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+def test_lassolarsic_noise_variance(fit_intercept):
+    """Check the behaviour when `n_samples` < `n_features` and that one needs
+    to provide the noise variance."""
+    rng = np.random.RandomState(0)
+    X, y = datasets.make_regression(
+        n_samples=10, n_features=11 - fit_intercept, random_state=rng
+    )
+
+    model = make_pipeline(StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept))
+
+    err_msg = (
+        "You are using LassoLarsIC in the case where the number of samples is smaller"
+        " than the number of features"
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        model.fit(X, y)
+
+    model.set_params(lassolarsic__noise_variance=1.0)
+    model.fit(X, y).predict(X)
@@ -0,0 +1,357 @@
+"""
+Tests for LinearModelLoss
+
+Note that correctness of losses (which compose LinearModelLoss) is already well
+covered in the _loss module.
+"""
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+from scipy import linalg, optimize
+
+from sklearn._loss.loss import (
+    HalfBinomialLoss,
+    HalfMultinomialLoss,
+    HalfPoissonLoss,
+)
+from sklearn.datasets import make_low_rank_matrix
+from sklearn.linear_model._linear_loss import LinearModelLoss
+from sklearn.utils.extmath import squared_norm
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# We do not need to test all losses, just what LinearModelLoss does on top of the
+# base losses.
+LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]
+
+
+def random_X_y_coef(
+    linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
+):
+    """Random generate y, X and coef in valid range."""
+    rng = np.random.RandomState(seed)
+    n_dof = n_features + linear_model_loss.fit_intercept
+    X = make_low_rank_matrix(
+        n_samples=n_samples,
+        n_features=n_features,
+        random_state=rng,
+    )
+    coef = linear_model_loss.init_zero_coef(X)
+
+    if linear_model_loss.base_loss.is_multiclass:
+        n_classes = linear_model_loss.base_loss.n_classes
+        coef.flat[:] = rng.uniform(
+            low=coef_bound[0],
+            high=coef_bound[1],
+            size=n_classes * n_dof,
+        )
+        if linear_model_loss.fit_intercept:
+            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
+        else:
+            raw_prediction = X @ coef.T
+        proba = linear_model_loss.base_loss.link.inverse(raw_prediction)
+
+        # y = rng.choice(np.arange(n_classes), p=proba) does not work.
+        # See https://stackoverflow.com/a/34190035/16761084
+        def choice_vectorized(items, p):
+            s = p.cumsum(axis=1)
+            r = rng.rand(p.shape[0])[:, None]
+            k = (s < r).sum(axis=1)
+            return items[k]
+
+        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
+    else:
+        coef.flat[:] = rng.uniform(
+            low=coef_bound[0],
+            high=coef_bound[1],
+            size=n_dof,
+        )
+        if linear_model_loss.fit_intercept:
+            raw_prediction = X @ coef[:-1] + coef[-1]
+        else:
+            raw_prediction = X @ coef
+        y = linear_model_loss.base_loss.link.inverse(
+            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
+        )
+
+    return X, y, coef
+
+
+@pytest.mark.parametrize("base_loss", LOSSES)
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("n_features", [0, 1, 10])
+@pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64])
+def test_init_zero_coef(base_loss, fit_intercept, n_features, dtype):
+    """Test that init_zero_coef initializes coef correctly."""
+    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
+    rng = np.random.RandomState(42)
+    X = rng.normal(size=(5, n_features))
+    coef = loss.init_zero_coef(X, dtype=dtype)
+    if loss.base_loss.is_multiclass:
+        n_classes = loss.base_loss.n_classes
+        assert coef.shape == (n_classes, n_features + fit_intercept)
+        assert coef.flags["F_CONTIGUOUS"]
+    else:
+        assert coef.shape == (n_features + fit_intercept,)
+
+    if dtype is None:
+        assert coef.dtype == X.dtype
+    else:
+        assert coef.dtype == dtype
+
+    assert np.count_nonzero(coef) == 0
+
+
+@pytest.mark.parametrize("base_loss", LOSSES)
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("sample_weight", [None, "range"])
+@pytest.mark.parametrize("l2_reg_strength", [0, 1])
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_loss_grad_hess_are_the_same(
+    base_loss, fit_intercept, sample_weight, l2_reg_strength, csr_container
+):
+    """Test that loss and gradient are the same across different functions."""
+    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
+    X, y, coef = random_X_y_coef(
+        linear_model_loss=loss, n_samples=10, n_features=5, seed=42
+    )
+
+    if sample_weight == "range":
+        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
+
+    l1 = loss.loss(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    g1 = loss.gradient(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    l2, g2 = loss.loss_gradient(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    g3, h3 = loss.gradient_hessian_product(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    if not base_loss.is_multiclass:
+        g4, h4, _ = loss.gradient_hessian(
+            coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+        )
+    else:
+        with pytest.raises(NotImplementedError):
+            loss.gradient_hessian(
+                coef,
+                X,
+                y,
+                sample_weight=sample_weight,
+                l2_reg_strength=l2_reg_strength,
+            )
+
+    assert_allclose(l1, l2)
+    assert_allclose(g1, g2)
+    assert_allclose(g1, g3)
+    if not base_loss.is_multiclass:
+        assert_allclose(g1, g4)
+        assert_allclose(h4 @ g4, h3(g3))
+
+    # same for sparse X
+    X = csr_container(X)
+    l1_sp = loss.loss(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    g1_sp = loss.gradient(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    l2_sp, g2_sp = loss.loss_gradient(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    g3_sp, h3_sp = loss.gradient_hessian_product(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    if not base_loss.is_multiclass:
+        g4_sp, h4_sp, _ = loss.gradient_hessian(
+            coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+        )
+
+    assert_allclose(l1, l1_sp)
+    assert_allclose(l1, l2_sp)
+    assert_allclose(g1, g1_sp)
+    assert_allclose(g1, g2_sp)
+    assert_allclose(g1, g3_sp)
+    assert_allclose(h3(g1), h3_sp(g1_sp))
+    if not base_loss.is_multiclass:
+        assert_allclose(g1, g4_sp)
+        assert_allclose(h4 @ g4, h4_sp @ g1_sp)
+
+
+@pytest.mark.parametrize("base_loss", LOSSES)
+@pytest.mark.parametrize("sample_weight", [None, "range"])
+@pytest.mark.parametrize("l2_reg_strength", [0, 1])
+@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [None])
+def test_loss_gradients_hessp_intercept(
+    base_loss, sample_weight, l2_reg_strength, X_container
+):
+    """Test that loss and gradient handle intercept correctly."""
+    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False)
+    loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True)
+    n_samples, n_features = 10, 5
+    X, y, coef = random_X_y_coef(
+        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
+    )
+
+    X[:, -1] = 1  # make last column of 1 to mimic intercept term
+    X_inter = X[
+        :, :-1
+    ]  # exclude intercept column as it is added automatically by loss_inter
+
+    if X_container is not None:
+        X = X_container(X)
+
+    if sample_weight == "range":
+        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
+
+    l, g = loss.loss_gradient(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    _, hessp = loss.gradient_hessian_product(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    l_inter, g_inter = loss_inter.loss_gradient(
+        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    _, hessp_inter = loss_inter.gradient_hessian_product(
+        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+
+    # Note, that intercept gets no L2 penalty.
+    assert l == pytest.approx(
+        l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
+    )
+
+    g_inter_corrected = g_inter
+    g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1]
+    assert_allclose(g, g_inter_corrected)
+
+    s = np.random.RandomState(42).randn(*coef.shape)
+    h = hessp(s)
+    h_inter = hessp_inter(s)
+    h_inter_corrected = h_inter
+    h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1]
+    assert_allclose(h, h_inter_corrected)
+
+
+@pytest.mark.parametrize("base_loss", LOSSES)
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("sample_weight", [None, "range"])
+@pytest.mark.parametrize("l2_reg_strength", [0, 1])
+def test_gradients_hessians_numerically(
+    base_loss, fit_intercept, sample_weight, l2_reg_strength
+):
+    """Test gradients and hessians with numerical derivatives.
+
+    Gradient should equal the numerical derivatives of the loss function.
+    Hessians should equal the numerical derivatives of gradients.
+
+    The gradient is checked against a combination of two finite-difference
+    estimates of the loss; one hessian-vector product is checked against the
+    slope of the gradient along that vector, estimated by least squares.
+    """
+    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
+    n_samples, n_features = 10, 5
+    X, y, coef = random_X_y_coef(
+        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
+    )
+    coef = coef.ravel(order="F")  # this is important only for multinomial loss
+
+    # "range" stands for sample weights 1, 2, ..., n_samples.
+    if sample_weight == "range":
+        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
+
+    # 1. Check gradients numerically
+    eps = 1e-6
+    g, hessp = loss.gradient_hessian_product(
+        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
+    )
+    # Use a trick to get central finite difference of accuracy 4 (five-point stencil)
+    # https://en.wikipedia.org/wiki/Numerical_differentiation
+    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
+    # approx_g1 = (f(x + eps) - f(x - eps)) / (2*eps)
+    # NOTE(review): `coef - eps` shifts every coordinate, not only the one being
+    # differentiated, so each difference is taken at a point whose other
+    # coordinates are offset by eps -- presumably negligible at the tolerance
+    # used below; confirm if tolerances are ever tightened.
+    approx_g1 = optimize.approx_fprime(
+        coef,
+        lambda coef: loss.loss(
+            coef - eps,
+            X,
+            y,
+            sample_weight=sample_weight,
+            l2_reg_strength=l2_reg_strength,
+        ),
+        2 * eps,
+    )
+    # approx_g2 = (f(x + 2*eps) - f(x - 2*eps)) / (4*eps)
+    approx_g2 = optimize.approx_fprime(
+        coef,
+        lambda coef: loss.loss(
+            coef - 2 * eps,
+            X,
+            y,
+            sample_weight=sample_weight,
+            l2_reg_strength=l2_reg_strength,
+        ),
+        4 * eps,
+    )
+    # Five-point stencil approximation
+    # See: https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative
+    approx_g = (4 * approx_g1 - approx_g2) / 3
+    assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)
+
+    # 2. Check hessp numerically along the second direction of the gradient
+    # `vector` is the unit vector selecting coefficient index 1.
+    vector = np.zeros_like(g)
+    vector[1] = 1
+    hess_col = hessp(vector)
+    # Computation of the Hessian is particularly fragile to numerical errors when doing
+    # simple finite differences. Here we compute the grad along a path in the direction
+    # of the vector and then use a least-square regression to estimate the slope
+    eps = 1e-3
+    d_x = np.linspace(-eps, eps, 30)
+    d_grad = np.array(
+        [
+            loss.gradient(
+                coef + t * vector,
+                X,
+                y,
+                sample_weight=sample_weight,
+                l2_reg_strength=l2_reg_strength,
+            )
+            for t in d_x
+        ]
+    )
+    # Center the gradients so that the regression through the origin below
+    # estimates the slope only.
+    d_grad -= d_grad.mean(axis=0)
+    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
+    assert_allclose(approx_hess_col, hess_col, rtol=1e-3)
+
+
+@pytest.mark.parametrize("fit_intercept", [False, True])
+def test_multinomial_coef_shape(fit_intercept):
+    """Test that multinomial LinearModelLoss respects shape of coef."""
+    loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept)
+    n_samples, n_features = 10, 5
+    X, y, coef = random_X_y_coef(
+        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
+    )
+    s = np.random.RandomState(42).randn(*coef.shape)
+
+    l, g = loss.loss_gradient(coef, X, y)
+    g1 = loss.gradient(coef, X, y)
+    g2, hessp = loss.gradient_hessian_product(coef, X, y)
+    h = hessp(s)
+    assert g.shape == coef.shape
+    assert h.shape == coef.shape
+    assert_allclose(g, g1)
+    assert_allclose(g, g2)
+
+    coef_r = coef.ravel(order="F")
+    s_r = s.ravel(order="F")
+    l_r, g_r = loss.loss_gradient(coef_r, X, y)
+    g1_r = loss.gradient(coef_r, X, y)
+    g2_r, hessp_r = loss.gradient_hessian_product(coef_r, X, y)
+    h_r = hessp_r(s_r)
+    assert g_r.shape == coef_r.shape
+    assert h_r.shape == coef_r.shape
+    assert_allclose(g_r, g1_r)
+    assert_allclose(g_r, g2_r)
+
+    assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F"))
+    assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F"))
@@ -0,0 +1,273 @@
+# Author: Vlad Niculae
+# License: BSD 3 clause
+
+
+import numpy as np
+import pytest
+
+from sklearn.datasets import make_sparse_coded_signal
+from sklearn.linear_model import (
+    LinearRegression,
+    OrthogonalMatchingPursuit,
+    OrthogonalMatchingPursuitCV,
+    orthogonal_mp,
+    orthogonal_mp_gram,
+)
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+    ignore_warnings,
+)
+
+# Problem dimensions shared by the tests in this module.
+n_samples, n_features, n_nonzero_coefs, n_targets = 25, 35, 5, 3
+# Note the keyword mapping: `n_samples=n_targets` and `n_features=n_samples`.
+# The three outputs are transposed on the next statement.
+y, X, gamma = make_sparse_coded_signal(
+    n_samples=n_targets,
+    n_components=n_features,
+    n_features=n_samples,
+    n_nonzero_coefs=n_nonzero_coefs,
+    random_state=0,
+)
+# this makes X (n_samples, n_features)
+# and y (n_samples, 3)
+y, X, gamma = y.T, X.T, gamma.T
+# Make X not of norm 1 for testing
+X *= 10
+y *= 10
+# Gram matrix X.T @ X and product X.T @ y, used by the Gram-based tests.
+G, Xy = np.dot(X.T, X), np.dot(X.T, y)
+
+
+def test_correct_shapes():
+    assert orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5).shape == (n_features,)
+    assert orthogonal_mp(X, y, n_nonzero_coefs=5).shape == (n_features, 3)
+
+
+def test_correct_shapes_gram():
+    assert orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5).shape == (n_features,)
+    assert orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5).shape == (n_features, 3)
+
+
+def test_n_nonzero_coefs():
+    assert np.count_nonzero(orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)) <= 5
+    assert (
+        np.count_nonzero(orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5, precompute=True))
+        <= 5
+    )
+
+
+def test_tol():
+    tol = 0.5
+    gamma = orthogonal_mp(X, y[:, 0], tol=tol)
+    gamma_gram = orthogonal_mp(X, y[:, 0], tol=tol, precompute=True)
+    assert np.sum((y[:, 0] - np.dot(X, gamma)) ** 2) <= tol
+    assert np.sum((y[:, 0] - np.dot(X, gamma_gram)) ** 2) <= tol
+
+
+def test_with_without_gram():
+    assert_array_almost_equal(
+        orthogonal_mp(X, y, n_nonzero_coefs=5),
+        orthogonal_mp(X, y, n_nonzero_coefs=5, precompute=True),
+    )
+
+
+def test_with_without_gram_tol():
+    assert_array_almost_equal(
+        orthogonal_mp(X, y, tol=1.0), orthogonal_mp(X, y, tol=1.0, precompute=True)
+    )
+
+
+def test_unreachable_accuracy():
+    assert_array_almost_equal(
+        orthogonal_mp(X, y, tol=0), orthogonal_mp(X, y, n_nonzero_coefs=n_features)
+    )
+    warning_message = (
+        "Orthogonal matching pursuit ended prematurely "
+        "due to linear dependence in the dictionary. "
+        "The requested precision might not have been met."
+    )
+    with pytest.warns(RuntimeWarning, match=warning_message):
+        assert_array_almost_equal(
+            orthogonal_mp(X, y, tol=0, precompute=True),
+            orthogonal_mp(X, y, precompute=True, n_nonzero_coefs=n_features),
+        )
+
+
+@pytest.mark.parametrize("positional_params", [(X, y), (G, Xy)])
+@pytest.mark.parametrize(
+    "keyword_params",
+    [{"n_nonzero_coefs": n_features + 1}],
+)
+def test_bad_input(positional_params, keyword_params):
+    """Requesting `n_features + 1` non-zero coefficients raises ValueError."""
+    # NOTE(review): the (G, Xy) case is passed to `orthogonal_mp` as well, not
+    # to `orthogonal_mp_gram` -- confirm whether the Gram solver was intended.
+    with pytest.raises(ValueError):
+        orthogonal_mp(*positional_params, **keyword_params)
+
+
+def test_perfect_signal_recovery():
+    (idx,) = gamma[:, 0].nonzero()
+    gamma_rec = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)
+    gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5)
+    assert_array_equal(idx, np.flatnonzero(gamma_rec))
+    assert_array_equal(idx, np.flatnonzero(gamma_gram))
+    assert_array_almost_equal(gamma[:, 0], gamma_rec, decimal=2)
+    assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2)
+
+
+def test_orthogonal_mp_gram_readonly():
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/5956
+    (idx,) = gamma[:, 0].nonzero()
+    G_readonly = G.copy()
+    G_readonly.setflags(write=False)
+    Xy_readonly = Xy.copy()
+    Xy_readonly.setflags(write=False)
+    gamma_gram = orthogonal_mp_gram(
+        G_readonly, Xy_readonly[:, 0], n_nonzero_coefs=5, copy_Gram=False, copy_Xy=False
+    )
+    assert_array_equal(idx, np.flatnonzero(gamma_gram))
+    assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2)
+
+
+def test_estimator():
+    """Check shapes and sparsity of a fitted OrthogonalMatchingPursuit.
+
+    One estimator instance is refit several times: on a single target and on
+    all targets, first as constructed and then with `fit_intercept=False`.
+    """
+    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)
+    # Single target: 1d coefficients and a scalar intercept.
+    omp.fit(X, y[:, 0])
+    assert omp.coef_.shape == (n_features,)
+    assert omp.intercept_.shape == ()
+    assert np.count_nonzero(omp.coef_) <= n_nonzero_coefs
+
+    # All targets: one row of coefficients and one intercept per target.
+    omp.fit(X, y)
+    assert omp.coef_.shape == (n_targets, n_features)
+    assert omp.intercept_.shape == (n_targets,)
+    assert np.count_nonzero(omp.coef_) <= n_targets * n_nonzero_coefs
+
+    # The first row of the multi-target fit must match a single-target refit.
+    coef_normalized = omp.coef_[0].copy()
+    omp.set_params(fit_intercept=True)
+    omp.fit(X, y[:, 0])
+    assert_array_almost_equal(coef_normalized, omp.coef_)
+
+    # Without an intercept, `intercept_` compares equal to 0.
+    omp.set_params(fit_intercept=False)
+    omp.fit(X, y[:, 0])
+    assert np.count_nonzero(omp.coef_) <= n_nonzero_coefs
+    assert omp.coef_.shape == (n_features,)
+    assert omp.intercept_ == 0
+
+    omp.fit(X, y)
+    assert omp.coef_.shape == (n_targets, n_features)
+    assert omp.intercept_ == 0
+    assert np.count_nonzero(omp.coef_) <= n_targets * n_nonzero_coefs
+
+
+def test_estimator_n_nonzero_coefs():
+    """Check `n_nonzero_coefs_` correct when `tol` is and isn't set."""
+    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)
+    omp.fit(X, y[:, 0])
+    assert omp.n_nonzero_coefs_ == n_nonzero_coefs
+
+    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs, tol=0.5)
+    omp.fit(X, y[:, 0])
+    assert omp.n_nonzero_coefs_ is None
+
+
+def test_identical_regressors():
+    newX = X.copy()
+    newX[:, 1] = newX[:, 0]
+    gamma = np.zeros(n_features)
+    gamma[0] = gamma[1] = 1.0
+    newy = np.dot(newX, gamma)
+    warning_message = (
+        "Orthogonal matching pursuit ended prematurely "
+        "due to linear dependence in the dictionary. "
+        "The requested precision might not have been met."
+    )
+    with pytest.warns(RuntimeWarning, match=warning_message):
+        orthogonal_mp(newX, newy, n_nonzero_coefs=2)
+
+
+def test_swapped_regressors():
+    gamma = np.zeros(n_features)
+    # X[:, 21] should be selected first, then X[:, 0] selected second,
+    # which will take X[:, 21]'s place in case the algorithm does
+    # column swapping for optimization (which is the case at the moment)
+    gamma[21] = 1.0
+    gamma[0] = 0.5
+    new_y = np.dot(X, gamma)
+    new_Xy = np.dot(X.T, new_y)
+    gamma_hat = orthogonal_mp(X, new_y, n_nonzero_coefs=2)
+    gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, n_nonzero_coefs=2)
+    assert_array_equal(np.flatnonzero(gamma_hat), [0, 21])
+    assert_array_equal(np.flatnonzero(gamma_hat_gram), [0, 21])
+
+
+def test_no_atoms():
+    y_empty = np.zeros_like(y)
+    Xy_empty = np.dot(X.T, y_empty)
+    gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, n_nonzero_coefs=1)
+    gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, n_nonzero_coefs=1)
+    assert np.all(gamma_empty == 0)
+    assert np.all(gamma_empty_gram == 0)
+
+
+def test_omp_path():
+    path = orthogonal_mp(X, y, n_nonzero_coefs=5, return_path=True)
+    last = orthogonal_mp(X, y, n_nonzero_coefs=5, return_path=False)
+    assert path.shape == (n_features, n_targets, 5)
+    assert_array_almost_equal(path[:, :, -1], last)
+    path = orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5, return_path=True)
+    last = orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5, return_path=False)
+    assert path.shape == (n_features, n_targets, 5)
+    assert_array_almost_equal(path[:, :, -1], last)
+
+
+def test_omp_return_path_prop_with_gram():
+    path = orthogonal_mp(X, y, n_nonzero_coefs=5, return_path=True, precompute=True)
+    last = orthogonal_mp(X, y, n_nonzero_coefs=5, return_path=False, precompute=True)
+    assert path.shape == (n_features, n_targets, 5)
+    assert_array_almost_equal(path[:, :, -1], last)
+
+
+def test_omp_cv():
+    y_ = y[:, 0]
+    gamma_ = gamma[:, 0]
+    ompcv = OrthogonalMatchingPursuitCV(fit_intercept=False, max_iter=10)
+    ompcv.fit(X, y_)
+    assert ompcv.n_nonzero_coefs_ == n_nonzero_coefs
+    assert_array_almost_equal(ompcv.coef_, gamma_)
+    omp = OrthogonalMatchingPursuit(
+        fit_intercept=False, n_nonzero_coefs=ompcv.n_nonzero_coefs_
+    )
+    omp.fit(X, y_)
+    assert_array_almost_equal(ompcv.coef_, omp.coef_)
+
+
+def test_omp_reaches_least_squares():
+    # Use small simple data; it's a sanity check but OMP can stop early
+    rng = check_random_state(0)
+    n_samples, n_features = (10, 8)
+    n_targets = 3
+    X = rng.randn(n_samples, n_features)
+    Y = rng.randn(n_samples, n_targets)
+    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_features)
+    lstsq = LinearRegression()
+    omp.fit(X, Y)
+    lstsq.fit(X, Y)
+    assert_array_almost_equal(omp.coef_, lstsq.coef_)
+
+
+@pytest.mark.parametrize("data_type", (np.float32, np.float64))
+def test_omp_gram_dtype_match(data_type):
+    # verify matching input data type and output data type
+    coef = orthogonal_mp_gram(
+        G.astype(data_type), Xy.astype(data_type), n_nonzero_coefs=5
+    )
+    assert coef.dtype == data_type
+
+
+def test_omp_gram_numerical_consistency():
+    """Compare Gram-solver results obtained from float32 and float64 inputs."""
+    # verify numerical consistency among np.float32 and np.float64
+    coef_32 = orthogonal_mp_gram(
+        G.astype(np.float32), Xy.astype(np.float32), n_nonzero_coefs=5
+    )
+    # NOTE(review): `G` is cast to float32 in this call too, so only `Xy` is
+    # float64 -- confirm whether `G.astype(np.float64)` was intended (the
+    # default `assert_allclose` tolerance may then need revisiting).
+    coef_64 = orthogonal_mp_gram(
+        G.astype(np.float32), Xy.astype(np.float64), n_nonzero_coefs=5
+    )
+    assert_allclose(coef_32, coef_64)
@@ -0,0 +1,278 @@
+import numpy as np
+import pytest
+
+from sklearn.base import ClassifierMixin
+from sklearn.datasets import load_iris
+from sklearn.linear_model import PassiveAggressiveClassifier, PassiveAggressiveRegressor
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# Shared data: the iris samples, shuffled once with a fixed seed (12).
+iris = load_iris()
+random_state = check_random_state(12)
+indices = np.arange(iris.data.shape[0])
+random_state.shuffle(indices)
+# `X` and `y` are indexed with the same permutation so rows stay aligned.
+X = iris.data[indices]
+y = iris.target[indices]
+
+
+class MyPassiveAggressive(ClassifierMixin):
+    def __init__(
+        self,
+        C=1.0,
+        epsilon=0.01,
+        loss="hinge",
+        fit_intercept=True,
+        n_iter=1,
+        random_state=None,
+    ):
+        self.C = C
+        self.epsilon = epsilon
+        self.loss = loss
+        self.fit_intercept = fit_intercept
+        self.n_iter = n_iter
+
+    def fit(self, X, y):
+        n_samples, n_features = X.shape
+        self.w = np.zeros(n_features, dtype=np.float64)
+        self.b = 0.0
+
+        for t in range(self.n_iter):
+            for i in range(n_samples):
+                p = self.project(X[i])
+                if self.loss in ("hinge", "squared_hinge"):
+                    loss = max(1 - y[i] * p, 0)
+                else:
+                    loss = max(np.abs(p - y[i]) - self.epsilon, 0)
+
+                sqnorm = np.dot(X[i], X[i])
+
+                if self.loss in ("hinge", "epsilon_insensitive"):
+                    step = min(self.C, loss / sqnorm)
+                elif self.loss in ("squared_hinge", "squared_epsilon_insensitive"):
+                    step = loss / (sqnorm + 1.0 / (2 * self.C))
+
+                if self.loss in ("hinge", "squared_hinge"):
+                    step *= y[i]
+                else:
+                    step *= np.sign(y[i] - p)
+
+                self.w += step * X[i]
+                if self.fit_intercept:
+                    self.b += step
+
+    def project(self, X):
+        return np.dot(X, self.w) + self.b
+
+
+@pytest.mark.parametrize("average", [False, True])
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+def test_classifier_accuracy(csr_container, fit_intercept, average):
+    data = csr_container(X) if csr_container is not None else X
+    clf = PassiveAggressiveClassifier(
+        C=1.0,
+        max_iter=30,
+        fit_intercept=fit_intercept,
+        random_state=1,
+        average=average,
+        tol=None,
+    )
+    clf.fit(data, y)
+    score = clf.score(data, y)
+    assert score > 0.79
+    if average:
+        assert hasattr(clf, "_average_coef")
+        assert hasattr(clf, "_average_intercept")
+        assert hasattr(clf, "_standard_intercept")
+        assert hasattr(clf, "_standard_coef")
+
+
+@pytest.mark.parametrize("average", [False, True])
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+def test_classifier_partial_fit(csr_container, average):
+    classes = np.unique(y)
+    data = csr_container(X) if csr_container is not None else X
+    clf = PassiveAggressiveClassifier(random_state=0, average=average, max_iter=5)
+    for t in range(30):
+        clf.partial_fit(data, y, classes)
+    score = clf.score(data, y)
+    assert score > 0.79
+    if average:
+        assert hasattr(clf, "_average_coef")
+        assert hasattr(clf, "_average_intercept")
+        assert hasattr(clf, "_standard_intercept")
+        assert hasattr(clf, "_standard_coef")
+
+
+def test_classifier_refit():
+    # Classifier can be retrained on different labels and features.
+    clf = PassiveAggressiveClassifier(max_iter=5).fit(X, y)
+    assert_array_equal(clf.classes_, np.unique(y))
+
+    clf.fit(X[:, :-1], iris.target_names[y])
+    assert_array_equal(clf.classes_, iris.target_names)
+
+
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+@pytest.mark.parametrize("loss", ("hinge", "squared_hinge"))
+def test_classifier_correctness(loss, csr_container):
+    y_bin = y.copy()
+    y_bin[y != 1] = -1
+
+    clf1 = MyPassiveAggressive(loss=loss, n_iter=2)
+    clf1.fit(X, y_bin)
+
+    data = csr_container(X) if csr_container is not None else X
+    clf2 = PassiveAggressiveClassifier(loss=loss, max_iter=2, shuffle=False, tol=None)
+    clf2.fit(data, y_bin)
+
+    assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2)
+
+
+@pytest.mark.parametrize(
+    "response_method", ["predict_proba", "predict_log_proba", "transform"]
+)
+def test_classifier_undefined_methods(response_method):
+    clf = PassiveAggressiveClassifier(max_iter=100)
+    with pytest.raises(AttributeError):
+        getattr(clf, response_method)
+
+
+def test_class_weights():
+    # Test class weights.
+    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]])
+    y2 = [1, 1, 1, -1, -1]
+
+    clf = PassiveAggressiveClassifier(
+        C=0.1, max_iter=100, class_weight=None, random_state=100
+    )
+    clf.fit(X2, y2)
+    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
+
+    # we give a small weights to class 1
+    clf = PassiveAggressiveClassifier(
+        C=0.1, max_iter=100, class_weight={1: 0.001}, random_state=100
+    )
+    clf.fit(X2, y2)
+
+    # now the hyperplane should rotate clock-wise and
+    # the prediction on this point should shift
+    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
+
+
+def test_partial_fit_weight_class_balanced():
+    # partial_fit with class_weight='balanced' not supported
+    clf = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100)
+    with pytest.raises(ValueError):
+        clf.partial_fit(X, y, classes=np.unique(y))
+
+
+def test_equal_class_weight():
+    X2 = [[1, 0], [1, 0], [0, 1], [0, 1]]
+    y2 = [0, 0, 1, 1]
+    clf = PassiveAggressiveClassifier(C=0.1, tol=None, class_weight=None)
+    clf.fit(X2, y2)
+
+    # Already balanced, so "balanced" weights should have no effect
+    clf_balanced = PassiveAggressiveClassifier(C=0.1, tol=None, class_weight="balanced")
+    clf_balanced.fit(X2, y2)
+
+    clf_weighted = PassiveAggressiveClassifier(
+        C=0.1, tol=None, class_weight={0: 0.5, 1: 0.5}
+    )
+    clf_weighted.fit(X2, y2)
+
+    # should be similar up to some epsilon due to learning rate schedule
+    assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
+    assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2)
+
+
+def test_wrong_class_weight_label():
+    # ValueError due to wrong class_weight label.
+    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]])
+    y2 = [1, 1, 1, -1, -1]
+
+    clf = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100)
+    with pytest.raises(ValueError):
+        clf.fit(X2, y2)
+
+
+@pytest.mark.parametrize("average", [False, True])
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+def test_regressor_mse(csr_container, fit_intercept, average):
+    y_bin = y.copy()
+    y_bin[y != 1] = -1
+
+    data = csr_container(X) if csr_container is not None else X
+    reg = PassiveAggressiveRegressor(
+        C=1.0,
+        fit_intercept=fit_intercept,
+        random_state=0,
+        average=average,
+        max_iter=5,
+    )
+    reg.fit(data, y_bin)
+    pred = reg.predict(data)
+    assert np.mean((pred - y_bin) ** 2) < 1.7
+    if average:
+        assert hasattr(reg, "_average_coef")
+        assert hasattr(reg, "_average_intercept")
+        assert hasattr(reg, "_standard_intercept")
+        assert hasattr(reg, "_standard_coef")
+
+
+@pytest.mark.parametrize("average", [False, True])
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+def test_regressor_partial_fit(csr_container, average):
+    y_bin = y.copy()
+    y_bin[y != 1] = -1
+
+    data = csr_container(X) if csr_container is not None else X
+    reg = PassiveAggressiveRegressor(random_state=0, average=average, max_iter=100)
+    for t in range(50):
+        reg.partial_fit(data, y_bin)
+    pred = reg.predict(data)
+    assert np.mean((pred - y_bin) ** 2) < 1.7
+    if average:
+        assert hasattr(reg, "_average_coef")
+        assert hasattr(reg, "_average_intercept")
+        assert hasattr(reg, "_standard_intercept")
+        assert hasattr(reg, "_standard_coef")
+
+
+@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
+@pytest.mark.parametrize("loss", ("epsilon_insensitive", "squared_epsilon_insensitive"))
+def test_regressor_correctness(loss, csr_container):
+    y_bin = y.copy()
+    y_bin[y != 1] = -1
+
+    reg1 = MyPassiveAggressive(loss=loss, n_iter=2)
+    reg1.fit(X, y_bin)
+
+    data = csr_container(X) if csr_container is not None else X
+    reg2 = PassiveAggressiveRegressor(tol=None, loss=loss, max_iter=2, shuffle=False)
+    reg2.fit(data, y_bin)
+
+    assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2)
+
+
+def test_regressor_undefined_methods():
+    reg = PassiveAggressiveRegressor(max_iter=100)
+    with pytest.raises(AttributeError):
+        reg.transform(X)
+
+
+# TODO(1.7): remove
+@pytest.mark.parametrize(
+    "Estimator", [PassiveAggressiveClassifier, PassiveAggressiveRegressor]
+)
+def test_passive_aggressive_deprecated_average(Estimator):
+    est = Estimator(average=0)
+    with pytest.warns(FutureWarning, match="average=0"):
+        est.fit(X, y)
@@ -0,0 +1,88 @@
+import numpy as np
+import pytest
+
+from sklearn.datasets import load_iris
+from sklearn.linear_model import Perceptron
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import assert_allclose, assert_array_almost_equal
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# Shared data: the iris samples, shuffled once with a fixed seed (12).
+iris = load_iris()
+random_state = check_random_state(12)
+indices = np.arange(iris.data.shape[0])
+random_state.shuffle(indices)
+# `X` and `y` are indexed with the same permutation so rows stay aligned.
+X = iris.data[indices]
+y = iris.target[indices]
+
+
+class MyPerceptron:
+    def __init__(self, n_iter=1):
+        self.n_iter = n_iter
+
+    def fit(self, X, y):
+        n_samples, n_features = X.shape
+        self.w = np.zeros(n_features, dtype=np.float64)
+        self.b = 0.0
+
+        for t in range(self.n_iter):
+            for i in range(n_samples):
+                if self.predict(X[i])[0] != y[i]:
+                    self.w += y[i] * X[i]
+                    self.b += y[i]
+
+    def project(self, X):
+        return np.dot(X, self.w) + self.b
+
+    def predict(self, X):
+        X = np.atleast_2d(X)
+        return np.sign(self.project(X))
+
+
+@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array])
+def test_perceptron_accuracy(container):
+    data = container(X)
+    clf = Perceptron(max_iter=100, tol=None, shuffle=False)
+    clf.fit(data, y)
+    score = clf.score(data, y)
+    assert score > 0.7
+
+
+def test_perceptron_correctness():
+    y_bin = y.copy()
+    y_bin[y != 1] = -1
+
+    clf1 = MyPerceptron(n_iter=2)
+    clf1.fit(X, y_bin)
+
+    clf2 = Perceptron(max_iter=2, shuffle=False, tol=None)
+    clf2.fit(X, y_bin)
+
+    assert_array_almost_equal(clf1.w, clf2.coef_.ravel())
+
+
+def test_undefined_methods():
+    clf = Perceptron(max_iter=100)
+    for meth in ("predict_proba", "predict_log_proba"):
+        with pytest.raises(AttributeError):
+            getattr(clf, meth)
+
+
+def test_perceptron_l1_ratio():
+    """Check that `l1_ratio` has an impact when `penalty='elasticnet'`"""
+    clf1 = Perceptron(l1_ratio=0, penalty="elasticnet")
+    clf1.fit(X, y)
+
+    clf2 = Perceptron(l1_ratio=0.15, penalty="elasticnet")
+    clf2.fit(X, y)
+
+    assert clf1.score(X, y) != clf2.score(X, y)
+
+    # check that the bounds of elastic net which should correspond to an l1 or
+    # l2 penalty depending of `l1_ratio` value.
+    clf_l1 = Perceptron(penalty="l1").fit(X, y)
+    clf_elasticnet = Perceptron(l1_ratio=1, penalty="elasticnet").fit(X, y)
+    assert_allclose(clf_l1.coef_, clf_elasticnet.coef_)
+
+    clf_l2 = Perceptron(penalty="l2").fit(X, y)
+    clf_elasticnet = Perceptron(l1_ratio=0, penalty="elasticnet").fit(X, y)
+    assert_allclose(clf_l2.coef_, clf_elasticnet.coef_)
@@ -0,0 +1,306 @@
+# Authors: David Dale <dale.david@mail.ru>
+#          Christian Lorentzen <lorentzen.ch@gmail.com>
+# License: BSD 3 clause
+
+import numpy as np
+import pytest
+from pytest import approx
+from scipy.optimize import minimize
+
+from sklearn.datasets import make_regression
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.linear_model import HuberRegressor, QuantileRegressor
+from sklearn.metrics import mean_pinball_loss
+from sklearn.utils._testing import assert_allclose, skip_if_32bit
+from sklearn.utils.fixes import (
+    COO_CONTAINERS,
+    CSC_CONTAINERS,
+    CSR_CONTAINERS,
+    parse_version,
+    sp_version,
+)
+
+
+@pytest.fixture
+def X_y_data():
+    X, y = make_regression(n_samples=10, n_features=1, random_state=0, noise=1)
+    return X, y
+
+
+@pytest.fixture
+def default_solver():
+    return "highs" if sp_version >= parse_version("1.6.0") else "interior-point"
+
+
+@pytest.mark.skipif(
+    parse_version(sp_version.base_version) >= parse_version("1.11"),
+    reason="interior-point solver is not available in SciPy 1.11",
+)
+@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"])
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_incompatible_solver_for_sparse_input(X_y_data, solver, csc_container):
+    X, y = X_y_data
+    X_sparse = csc_container(X)
+    err_msg = (
+        f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        QuantileRegressor(solver=solver).fit(X_sparse, y)
+
+
+@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
+@pytest.mark.skipif(
+    sp_version >= parse_version("1.6.0"),
+    reason="Solvers are available as of scipy 1.6.0",
+)
+def test_too_new_solver_methods_raise_error(X_y_data, solver):
+    """Test that highs solver raises for scipy<1.6.0."""
+    X, y = X_y_data
+    with pytest.raises(ValueError, match="scipy>=1.6.0"):
+        QuantileRegressor(solver=solver).fit(X, y)
+
+
+@pytest.mark.parametrize(
+    "quantile, alpha, intercept, coef",
+    [
+        # for 50% quantile w/o regularization, any slope in [1, 10] is okay
+        [0.5, 0, 1, None],
+        # if positive error costs more, the slope is maximal
+        [0.51, 0, 1, 10],
+        # if negative error costs more, the slope is minimal
+        [0.49, 0, 1, 1],
+        # for a small lasso penalty, the slope is also minimal
+        [0.5, 0.01, 1, 1],
+        # for a large lasso penalty, the model predicts the constant median
+        [0.5, 100, 2, 0],
+    ],
+)
+def test_quantile_toy_example(quantile, alpha, intercept, coef, default_solver):
+    # test how different parameters affect a small intuitive example
+    X = [[0], [1], [1]]
+    y = [1, 2, 11]
+    model = QuantileRegressor(
+        quantile=quantile, alpha=alpha, solver=default_solver
+    ).fit(X, y)
+    assert_allclose(model.intercept_, intercept, atol=1e-2)
+    if coef is not None:
+        assert_allclose(model.coef_[0], coef, atol=1e-2)
+    if alpha < 100:
+        assert model.coef_[0] >= 1
+    assert model.coef_[0] <= 10
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+def test_quantile_equals_huber_for_low_epsilon(fit_intercept, default_solver):
+    X, y = make_regression(n_samples=100, n_features=20, random_state=0, noise=1.0)
+    alpha = 1e-4
+    huber = HuberRegressor(
+        epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept
+    ).fit(X, y)
+    quant = QuantileRegressor(
+        alpha=alpha, fit_intercept=fit_intercept, solver=default_solver
+    ).fit(X, y)
+    assert_allclose(huber.coef_, quant.coef_, atol=1e-1)
+    if fit_intercept:
+        assert huber.intercept_ == approx(quant.intercept_, abs=1e-1)
+        # check that we still predict fraction
+        assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1)
+
+
+@pytest.mark.parametrize("q", [0.5, 0.9, 0.05])
+def test_quantile_estimates_calibration(q, default_solver):
+    # Test that model estimates percentage of points below the prediction
+    X, y = make_regression(n_samples=1000, n_features=20, random_state=0, noise=1.0)
+    quant = QuantileRegressor(
+        quantile=q,
+        alpha=0,
+        solver=default_solver,
+    ).fit(X, y)
+    assert np.mean(y < quant.predict(X)) == approx(q, abs=1e-2)
+
+
+def test_quantile_sample_weight(default_solver):
+    # test that with unequal sample weights we still estimate weighted fraction
+    n = 1000
+    X, y = make_regression(n_samples=n, n_features=5, random_state=0, noise=10.0)
+    weight = np.ones(n)
+    # when we increase weight of upper observations,
+    # estimate of quantile should go up
+    weight[y > y.mean()] = 100
+    quant = QuantileRegressor(quantile=0.5, alpha=1e-8, solver=default_solver)
+    quant.fit(X, y, sample_weight=weight)
+    fraction_below = np.mean(y < quant.predict(X))
+    assert fraction_below > 0.5
+    weighted_fraction_below = np.average(y < quant.predict(X), weights=weight)
+    assert weighted_fraction_below == approx(0.5, abs=3e-2)
+
+
+@pytest.mark.skipif(
+    sp_version < parse_version("1.6.0"),
+    reason="The `highs` solver is available from the 1.6.0 scipy version",
+)
+@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
+def test_asymmetric_error(quantile, default_solver):
+    """Test quantile regression for asymmetric distributed targets.
+
+    Targets are drawn from an exponential distribution whose `quantile`-level
+    quantile is `X @ coef + intercept`. The unpenalized fit is compared with
+    the true parameters, then the L1-penalized fit is compared with a
+    Nelder-Mead minimization of the same objective.
+    """
+    n_samples = 1000
+    rng = np.random.RandomState(42)
+    # Two features: |standard normal| and minus a random 0/1 indicator.
+    X = np.concatenate(
+        (
+            np.abs(rng.randn(n_samples)[:, None]),
+            -rng.randint(2, size=(n_samples, 1)),
+        ),
+        axis=1,
+    )
+    intercept = 1.23
+    coef = np.array([0.5, -2])
+    #  Take care that X @ coef + intercept > 0
+    assert np.min(X @ coef + intercept) > 0
+    # For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
+    # the quantile at level q is:
+    #   quantile(q) = - log(1 - q) / lambda
+    #   scale = 1/lambda = -quantile(q) / log(1 - q)
+    y = rng.exponential(
+        scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples
+    )
+    model = QuantileRegressor(
+        quantile=quantile,
+        alpha=0,
+        solver=default_solver,
+    ).fit(X, y)
+    # This test can be made to pass with any solver but in the interest
+    # of sparing continuous integration resources, the test is performed
+    # with the fastest solver only.
+
+    assert model.intercept_ == approx(intercept, rel=0.2)
+    assert_allclose(model.coef_, coef, rtol=0.6)
+    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)
+
+    # Now compare to Nelder-Mead optimization with L1 penalty
+    alpha = 0.01
+    model.set_params(alpha=alpha).fit(X, y)
+    model_coef = np.r_[model.intercept_, model.coef_]
+
+    def func(coef):
+        # coef[0] is the intercept; only coef[1:] enters the L1 penalty.
+        loss = mean_pinball_loss(y, X @ coef[1:] + coef[0], alpha=quantile)
+        L1 = np.sum(np.abs(coef[1:]))
+        return loss + alpha * L1
+
+    res = minimize(
+        fun=func,
+        x0=[1, 0, -1],
+        method="Nelder-Mead",
+        tol=1e-12,
+        options={"maxiter": 2000},
+    )
+
+    assert func(model_coef) == approx(func(res.x))
+    assert_allclose(model.intercept_, res.x[0])
+    assert_allclose(model.coef_, res.x[1:])
+    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)
+
+
+@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
+def test_equivariance(quantile, default_solver):
+    """Test equivariance of quantile regression.
+
+    Four properties are checked against a reference fit `model1`: positive
+    scaling of y, negative scaling of y (with quantile `1 - q`), shifting y by
+    a linear function of X, and a linear reparametrization of X.
+
+    See Koenker (2005) Quantile Regression, Chapter 2.2.3.
+    """
+    # A single RandomState is shared by all draws below, so their order matters.
+    rng = np.random.RandomState(42)
+    n_samples, n_features = 100, 5
+    X, y = make_regression(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=n_features,
+        noise=0,
+        random_state=rng,
+        shuffle=False,
+    )
+    # make y asymmetric
+    y += rng.exponential(scale=100, size=y.shape)
+    params = dict(alpha=0, solver=default_solver)
+    model1 = QuantileRegressor(quantile=quantile, **params).fit(X, y)
+
+    # coef(q; a*y, X) = a * coef(q; y, X)
+    a = 2.5
+    model2 = QuantileRegressor(quantile=quantile, **params).fit(X, a * y)
+    assert model2.intercept_ == approx(a * model1.intercept_, rel=1e-5)
+    assert_allclose(model2.coef_, a * model1.coef_, rtol=1e-5)
+
+    # coef(1-q; -a*y, X) = -a * coef(q; y, X)
+    model2 = QuantileRegressor(quantile=1 - quantile, **params).fit(X, -a * y)
+    assert model2.intercept_ == approx(-a * model1.intercept_, rel=1e-5)
+    assert_allclose(model2.coef_, -a * model1.coef_, rtol=1e-5)
+
+    # coef(q; y + X @ g, X) = coef(q; y, X) + g
+    g_intercept, g_coef = rng.randn(), rng.randn(n_features)
+    model2 = QuantileRegressor(quantile=quantile, **params)
+    model2.fit(X, y + X @ g_coef + g_intercept)
+    assert model2.intercept_ == approx(model1.intercept_ + g_intercept)
+    assert_allclose(model2.coef_, model1.coef_ + g_coef, rtol=1e-6)
+
+    # coef(q; y, X @ A) = A^-1 @ coef(q; y, X)
+    A = rng.randn(n_features, n_features)
+    model2 = QuantileRegressor(quantile=quantile, **params)
+    model2.fit(X @ A, y)
+    assert model2.intercept_ == approx(model1.intercept_, rel=1e-5)
+    assert_allclose(model2.coef_, np.linalg.solve(A, model1.coef_), rtol=1e-5)
+
+
+@pytest.mark.skipif(
+    parse_version(sp_version.base_version) >= parse_version("1.11"),
+    reason="interior-point solver is not available in SciPy 1.11",
+)
+@pytest.mark.filterwarnings("ignore:`method='interior-point'` is deprecated")
+def test_linprog_failure():
+    """Test that linprog fails."""
+    X = np.linspace(0, 10, num=10).reshape(-1, 1)
+    y = np.linspace(0, 10, num=10)
+    reg = QuantileRegressor(
+        alpha=0, solver="interior-point", solver_options={"maxiter": 1}
+    )
+
+    msg = "Linear programming for QuantileRegressor did not succeed."
+    with pytest.warns(ConvergenceWarning, match=msg):
+        reg.fit(X, y)
+
+
+# NOTE(review): this skip condition uses `<=`, so SciPy 1.6.0 itself is
+# skipped although the reason says the solvers exist "as of" 1.6.0 and other
+# tests in this file use `<` -- confirm which comparison is intended.
+@skip_if_32bit
+@pytest.mark.skipif(
+    sp_version <= parse_version("1.6.0"),
+    reason="Solvers are available as of scipy 1.6.0",
+)
+@pytest.mark.parametrize(
+    "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS
+)
+@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"])
+@pytest.mark.parametrize("fit_intercept", [True, False])
+def test_sparse_input(sparse_container, solver, fit_intercept, default_solver):
+    """Test that sparse and dense X give same results."""
+    X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0)
+    X_sparse = sparse_container(X)
+    alpha = 1e-4
+    # The dense reference always uses `default_solver`; only the sparse fit
+    # uses the parametrized `solver`.
+    quant_dense = QuantileRegressor(
+        alpha=alpha, fit_intercept=fit_intercept, solver=default_solver
+    ).fit(X, y)
+    quant_sparse = QuantileRegressor(
+        alpha=alpha, fit_intercept=fit_intercept, solver=solver
+    ).fit(X_sparse, y)
+    assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2)
+    if fit_intercept:
+        assert quant_sparse.intercept_ == approx(quant_dense.intercept_)
+        # check that we still predict fraction
+        assert 0.45 <= np.mean(y < quant_sparse.predict(X_sparse)) <= 0.57
+
+
+def test_error_interior_point_future(X_y_data, monkeypatch):
+    """Check that we will raise a proper error when requesting
+    `solver='interior-point'` in SciPy >= 1.11.
+    """
+    X, y = X_y_data
+    import sklearn.linear_model._quantile
+
+    with monkeypatch.context() as m:
+        m.setattr(sklearn.linear_model._quantile, "sp_version", parse_version("1.11.0"))
+        err_msg = "Solver interior-point is not anymore available in SciPy >= 1.11.0."
+        with pytest.raises(ValueError, match=err_msg):
+            QuantileRegressor(solver="interior-point").fit(X, y)
@@ -0,0 +1,545 @@
+import numpy as np
+import pytest
+from numpy.testing import assert_array_almost_equal, assert_array_equal
+
+from sklearn.datasets import make_regression
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.linear_model import (
+    LinearRegression,
+    OrthogonalMatchingPursuit,
+    RANSACRegressor,
+    Ridge,
+)
+from sklearn.linear_model._ransac import _dynamic_max_trials
+from sklearn.utils import check_random_state
+from sklearn.utils._testing import assert_allclose
+from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS
+
+# Generate coordinates of a line: 400 integer abscissas and y = 0.2 * x + 20.
+X = np.arange(-200, 200)
+y = 0.2 * X + 20
+data = np.column_stack([X, y])
+
+# Add some faulty data: draw 200 random row indices (duplicates are removed
+# by ``np.unique``) and shift both coordinates of those rows by 50 plus a
+# uniform offset in [0, 10). ``outliers`` is reused by the tests below as the
+# reference set of outlier indices.
+rng = np.random.RandomState(1000)
+outliers = np.unique(rng.randint(len(X), size=200))
+data[outliers, :] += 50 + rng.rand(len(outliers), 2) * 10
+
+# Split the corrupted data back into a 2-D feature column and a 1-D target.
+X = data[:, 0][:, np.newaxis]
+y = data[:, 1]
+
+
+def test_ransac_inliers_outliers():
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+
+    # Estimate parameters of corrupted data
+    ransac_estimator.fit(X, y)
+
+    # Ground truth / reference inlier mask
+    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
+    ref_inlier_mask[outliers] = False
+
+    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
+
+
+def test_ransac_is_data_valid():
+    def is_data_valid(X, y):
+        assert X.shape[0] == 2
+        assert y.shape[0] == 2
+        return False
+
+    rng = np.random.RandomState(0)
+    X = rng.rand(10, 2)
+    y = rng.rand(10, 1)
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        is_data_valid=is_data_valid,
+        random_state=0,
+    )
+    with pytest.raises(ValueError):
+        ransac_estimator.fit(X, y)
+
+
+def test_ransac_is_model_valid():
+    def is_model_valid(estimator, X, y):
+        assert X.shape[0] == 2
+        assert y.shape[0] == 2
+        return False
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        is_model_valid=is_model_valid,
+        random_state=0,
+    )
+    with pytest.raises(ValueError):
+        ransac_estimator.fit(X, y)
+
+
+def test_ransac_max_trials():
+    estimator = LinearRegression()
+
+    ransac_estimator = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        max_trials=0,
+        random_state=0,
+    )
+    with pytest.raises(ValueError):
+        ransac_estimator.fit(X, y)
+
+    # there is a 1e-9 chance it will take these many trials. No good reason
+    # 1e-2 isn't enough, can still happen
+    # 2 is the what ransac defines  as min_samples = X.shape[1] + 1
+    max_trials = _dynamic_max_trials(len(X) - len(outliers), X.shape[0], 2, 1 - 1e-9)
+    ransac_estimator = RANSACRegressor(estimator, min_samples=2)
+    for i in range(50):
+        ransac_estimator.set_params(min_samples=2, random_state=i)
+        ransac_estimator.fit(X, y)
+        assert ransac_estimator.n_trials_ < max_trials + 1
+
+
+def test_ransac_stop_n_inliers():
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        stop_n_inliers=2,
+        random_state=0,
+    )
+    ransac_estimator.fit(X, y)
+
+    assert ransac_estimator.n_trials_ == 1
+
+
+def test_ransac_stop_score():
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        stop_score=0,
+        random_state=0,
+    )
+    ransac_estimator.fit(X, y)
+
+    assert ransac_estimator.n_trials_ == 1
+
+
+def test_ransac_score():
+    X = np.arange(100)[:, None]
+    y = np.zeros((100,))
+    y[0] = 1
+    y[1] = 100
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=0.5, random_state=0
+    )
+    ransac_estimator.fit(X, y)
+
+    assert ransac_estimator.score(X[2:], y[2:]) == 1
+    assert ransac_estimator.score(X[:2], y[:2]) < 1
+
+
+def test_ransac_predict():
+    X = np.arange(100)[:, None]
+    y = np.zeros((100,))
+    y[0] = 1
+    y[1] = 100
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=0.5, random_state=0
+    )
+    ransac_estimator.fit(X, y)
+
+    assert_array_equal(ransac_estimator.predict(X), np.zeros(100))
+
+
+def test_ransac_no_valid_data():
+    def is_data_valid(X, y):
+        return False
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, is_data_valid=is_data_valid, max_trials=5
+    )
+
+    msg = "RANSAC could not find a valid consensus set"
+    with pytest.raises(ValueError, match=msg):
+        ransac_estimator.fit(X, y)
+    assert ransac_estimator.n_skips_no_inliers_ == 0
+    assert ransac_estimator.n_skips_invalid_data_ == 5
+    assert ransac_estimator.n_skips_invalid_model_ == 0
+
+
+def test_ransac_no_valid_model():
+    def is_model_valid(estimator, X, y):
+        return False
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, is_model_valid=is_model_valid, max_trials=5
+    )
+
+    msg = "RANSAC could not find a valid consensus set"
+    with pytest.raises(ValueError, match=msg):
+        ransac_estimator.fit(X, y)
+    assert ransac_estimator.n_skips_no_inliers_ == 0
+    assert ransac_estimator.n_skips_invalid_data_ == 0
+    assert ransac_estimator.n_skips_invalid_model_ == 5
+
+
+def test_ransac_exceed_max_skips():
+    def is_data_valid(X, y):
+        return False
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, is_data_valid=is_data_valid, max_trials=5, max_skips=3
+    )
+
+    msg = "RANSAC skipped more iterations than `max_skips`"
+    with pytest.raises(ValueError, match=msg):
+        ransac_estimator.fit(X, y)
+    assert ransac_estimator.n_skips_no_inliers_ == 0
+    assert ransac_estimator.n_skips_invalid_data_ == 4
+    assert ransac_estimator.n_skips_invalid_model_ == 0
+
+
+def test_ransac_warn_exceed_max_skips():
+    global cause_skip
+    cause_skip = False
+
+    def is_data_valid(X, y):
+        global cause_skip
+        if not cause_skip:
+            cause_skip = True
+            return True
+        else:
+            return False
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, is_data_valid=is_data_valid, max_skips=3, max_trials=5
+    )
+    warning_message = (
+        "RANSAC found a valid consensus set but exited "
+        "early due to skipping more iterations than "
+        "`max_skips`. See estimator attributes for "
+        "diagnostics."
+    )
+    with pytest.warns(ConvergenceWarning, match=warning_message):
+        ransac_estimator.fit(X, y)
+    assert ransac_estimator.n_skips_no_inliers_ == 0
+    assert ransac_estimator.n_skips_invalid_data_ == 4
+    assert ransac_estimator.n_skips_invalid_model_ == 0
+
+
+@pytest.mark.parametrize(
+    "sparse_container", COO_CONTAINERS + CSR_CONTAINERS + CSC_CONTAINERS
+)
+def test_ransac_sparse(sparse_container):
+    X_sparse = sparse_container(X)
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+    ransac_estimator.fit(X_sparse, y)
+
+    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
+    ref_inlier_mask[outliers] = False
+
+    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
+
+
+def test_ransac_none_estimator():
+    estimator = LinearRegression()
+
+    ransac_estimator = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+    ransac_none_estimator = RANSACRegressor(
+        None, min_samples=2, residual_threshold=5, random_state=0
+    )
+
+    ransac_estimator.fit(X, y)
+    ransac_none_estimator.fit(X, y)
+
+    assert_array_almost_equal(
+        ransac_estimator.predict(X), ransac_none_estimator.predict(X)
+    )
+
+
+def test_ransac_min_n_samples():
+    estimator = LinearRegression()
+    ransac_estimator1 = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+    ransac_estimator2 = RANSACRegressor(
+        estimator,
+        min_samples=2.0 / X.shape[0],
+        residual_threshold=5,
+        random_state=0,
+    )
+    ransac_estimator5 = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+    ransac_estimator6 = RANSACRegressor(estimator, residual_threshold=5, random_state=0)
+    ransac_estimator7 = RANSACRegressor(
+        estimator, min_samples=X.shape[0] + 1, residual_threshold=5, random_state=0
+    )
+    # GH #19390
+    ransac_estimator8 = RANSACRegressor(
+        Ridge(), min_samples=None, residual_threshold=5, random_state=0
+    )
+
+    ransac_estimator1.fit(X, y)
+    ransac_estimator2.fit(X, y)
+    ransac_estimator5.fit(X, y)
+    ransac_estimator6.fit(X, y)
+
+    assert_array_almost_equal(
+        ransac_estimator1.predict(X), ransac_estimator2.predict(X)
+    )
+    assert_array_almost_equal(
+        ransac_estimator1.predict(X), ransac_estimator5.predict(X)
+    )
+    assert_array_almost_equal(
+        ransac_estimator1.predict(X), ransac_estimator6.predict(X)
+    )
+
+    with pytest.raises(ValueError):
+        ransac_estimator7.fit(X, y)
+
+    err_msg = "`min_samples` needs to be explicitly set"
+    with pytest.raises(ValueError, match=err_msg):
+        ransac_estimator8.fit(X, y)
+
+
+def test_ransac_multi_dimensional_targets():
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+
+    # 3-D target values
+    yyy = np.column_stack([y, y, y])
+
+    # Estimate parameters of corrupted data
+    ransac_estimator.fit(X, yyy)
+
+    # Ground truth / reference inlier mask
+    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
+    ref_inlier_mask[outliers] = False
+
+    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
+
+
+def test_ransac_residual_loss():
+    def loss_multi1(y_true, y_pred):
+        return np.sum(np.abs(y_true - y_pred), axis=1)
+
+    def loss_multi2(y_true, y_pred):
+        return np.sum((y_true - y_pred) ** 2, axis=1)
+
+    def loss_mono(y_true, y_pred):
+        return np.abs(y_true - y_pred)
+
+    yyy = np.column_stack([y, y, y])
+
+    estimator = LinearRegression()
+    ransac_estimator0 = RANSACRegressor(
+        estimator, min_samples=2, residual_threshold=5, random_state=0
+    )
+    ransac_estimator1 = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        random_state=0,
+        loss=loss_multi1,
+    )
+    ransac_estimator2 = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        random_state=0,
+        loss=loss_multi2,
+    )
+
+    # multi-dimensional
+    ransac_estimator0.fit(X, yyy)
+    ransac_estimator1.fit(X, yyy)
+    ransac_estimator2.fit(X, yyy)
+    assert_array_almost_equal(
+        ransac_estimator0.predict(X), ransac_estimator1.predict(X)
+    )
+    assert_array_almost_equal(
+        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
+    )
+
+    # one-dimensional
+    ransac_estimator0.fit(X, y)
+    ransac_estimator2.loss = loss_mono
+    ransac_estimator2.fit(X, y)
+    assert_array_almost_equal(
+        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
+    )
+    ransac_estimator3 = RANSACRegressor(
+        estimator,
+        min_samples=2,
+        residual_threshold=5,
+        random_state=0,
+        loss="squared_error",
+    )
+    ransac_estimator3.fit(X, y)
+    assert_array_almost_equal(
+        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
+    )
+
+
+def test_ransac_default_residual_threshold():
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(estimator, min_samples=2, random_state=0)
+
+    # Estimate parameters of corrupted data
+    ransac_estimator.fit(X, y)
+
+    # Ground truth / reference inlier mask
+    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
+    ref_inlier_mask[outliers] = False
+
+    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
+
+
+def test_ransac_dynamic_max_trials():
+    # Numbers hand-calculated and confirmed on page 119 (Table 4.3) in
+    #   Hartley, R.~I. and Zisserman, A., 2004,
+    #   Multiple View Geometry in Computer Vision, Second Edition,
+    #   Cambridge University Press, ISBN: 0521540518
+
+    # e = 0%, min_samples = X
+    assert _dynamic_max_trials(100, 100, 2, 0.99) == 1
+
+    # e = 5%, min_samples = 2
+    assert _dynamic_max_trials(95, 100, 2, 0.99) == 2
+    # e = 10%, min_samples = 2
+    assert _dynamic_max_trials(90, 100, 2, 0.99) == 3
+    # e = 30%, min_samples = 2
+    assert _dynamic_max_trials(70, 100, 2, 0.99) == 7
+    # e = 50%, min_samples = 2
+    assert _dynamic_max_trials(50, 100, 2, 0.99) == 17
+
+    # e = 5%, min_samples = 8
+    assert _dynamic_max_trials(95, 100, 8, 0.99) == 5
+    # e = 10%, min_samples = 8
+    assert _dynamic_max_trials(90, 100, 8, 0.99) == 9
+    # e = 30%, min_samples = 8
+    assert _dynamic_max_trials(70, 100, 8, 0.99) == 78
+    # e = 50%, min_samples = 8
+    assert _dynamic_max_trials(50, 100, 8, 0.99) == 1177
+
+    # e = 0%, min_samples = 10
+    assert _dynamic_max_trials(1, 100, 10, 0) == 0
+    assert _dynamic_max_trials(1, 100, 10, 1) == float("inf")
+
+
+def test_ransac_fit_sample_weight():
+    ransac_estimator = RANSACRegressor(random_state=0)
+    n_samples = y.shape[0]
+    weights = np.ones(n_samples)
+    ransac_estimator.fit(X, y, sample_weight=weights)
+    # sanity check
+    assert ransac_estimator.inlier_mask_.shape[0] == n_samples
+
+    ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_)
+    ref_inlier_mask[outliers] = False
+    # check that mask is correct
+    assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask)
+
+    # check that fit(X)  = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where
+    #   X = X1 repeated n1 times, X2 repeated n2 times and so forth
+    random_state = check_random_state(0)
+    X_ = random_state.randint(0, 200, [10, 1])
+    y_ = np.ndarray.flatten(0.2 * X_ + 2)
+    sample_weight = random_state.randint(0, 10, 10)
+    outlier_X = random_state.randint(0, 1000, [1, 1])
+    outlier_weight = random_state.randint(0, 10, 1)
+    outlier_y = random_state.randint(-1000, 0, 1)
+
+    X_flat = np.append(
+        np.repeat(X_, sample_weight, axis=0),
+        np.repeat(outlier_X, outlier_weight, axis=0),
+        axis=0,
+    )
+    y_flat = np.ndarray.flatten(
+        np.append(
+            np.repeat(y_, sample_weight, axis=0),
+            np.repeat(outlier_y, outlier_weight, axis=0),
+            axis=0,
+        )
+    )
+    ransac_estimator.fit(X_flat, y_flat)
+    ref_coef_ = ransac_estimator.estimator_.coef_
+
+    sample_weight = np.append(sample_weight, outlier_weight)
+    X_ = np.append(X_, outlier_X, axis=0)
+    y_ = np.append(y_, outlier_y)
+    ransac_estimator.fit(X_, y_, sample_weight=sample_weight)
+
+    assert_allclose(ransac_estimator.estimator_.coef_, ref_coef_)
+
+    # check that if estimator.fit doesn't support
+    # sample_weight, raises error
+    estimator = OrthogonalMatchingPursuit()
+    ransac_estimator = RANSACRegressor(estimator, min_samples=10)
+
+    err_msg = f"{estimator.__class__.__name__} does not support sample_weight."
+    with pytest.raises(ValueError, match=err_msg):
+        ransac_estimator.fit(X, y, sample_weight=weights)
+
+
+def test_ransac_final_model_fit_sample_weight():
+    X, y = make_regression(n_samples=1000, random_state=10)
+    rng = check_random_state(42)
+    sample_weight = rng.randint(1, 4, size=y.shape[0])
+    sample_weight = sample_weight / sample_weight.sum()
+    ransac = RANSACRegressor(random_state=0)
+    ransac.fit(X, y, sample_weight=sample_weight)
+
+    final_model = LinearRegression()
+    mask_samples = ransac.inlier_mask_
+    final_model.fit(
+        X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples]
+    )
+
+    assert_allclose(ransac.estimator_.coef_, final_model.coef_, atol=1e-12)
+
+
+def test_perfect_horizontal_line():
+    """Check that we can fit a line where all samples are inliers.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19497
+    """
+    X = np.arange(100)[:, None]
+    y = np.zeros((100,))
+
+    estimator = LinearRegression()
+    ransac_estimator = RANSACRegressor(estimator, random_state=0)
+    ransac_estimator.fit(X, y)
+
+    assert_allclose(ransac_estimator.estimator_.coef_, 0.0)
+    assert_allclose(ransac_estimator.estimator_.intercept_, 0.0)
@@ -0,0 +1,947 @@
+# Authors: Danny Sullivan <dbsullivan23@gmail.com>
+#          Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
+#
+# License: BSD 3 clause
+
+import math
+import re
+
+import numpy as np
+import pytest
+from scipy.special import logsumexp
+
+from sklearn._loss.loss import HalfMultinomialLoss
+from sklearn.base import clone
+from sklearn.datasets import load_iris, make_blobs, make_classification
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.linear_model._base import make_dataset
+from sklearn.linear_model._linear_loss import LinearModelLoss
+from sklearn.linear_model._sag import get_auto_step_size
+from sklearn.linear_model._sag_fast import _multinomial_grad_loss_all_samples
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.preprocessing import LabelBinarizer, LabelEncoder
+from sklearn.utils import check_random_state, compute_class_weight
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_almost_equal,
+)
+from sklearn.utils.extmath import row_norms
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# Iris dataset loaded once at import time. It is not referenced in this part
+# of the file; presumably it is shared by tests further down (TODO confirm).
+iris = load_iris()
+
+
+# this is used for sag classification
+def log_dloss(p, y):
+    z = p * y
+    # approximately equal and saves the computation of the log
+    if z > 18.0:
+        return math.exp(-z) * -y
+    if z < -18.0:
+        return -y
+    return -y / (math.exp(z) + 1.0)
+
+
+def log_loss(p, y):
+    return np.mean(np.log(1.0 + np.exp(-y * p)))
+
+
+# this is used for sag regression
+def squared_dloss(p, y):
+    return p - y
+
+
+def squared_loss(p, y):
+    return np.mean(0.5 * (p - y) * (p - y))
+
+
+# function for measuring the log loss
+def get_pobj(w, alpha, myX, myy, loss):
+    w = w.ravel()
+    pred = np.dot(myX, w)
+    p = loss(pred, myy)
+    p += alpha * w.dot(w) / 2.0
+    return p
+
+
+def sag(
+    X,
+    y,
+    step_size,
+    alpha,
+    n_iter=1,
+    dloss=None,
+    sparse=False,
+    sample_weight=None,
+    fit_intercept=True,
+    saga=False,
+):
+    """Plain-Python SAG/SAGA solver keeping one stored update per sample.
+
+    The tests in this file compare its output with scikit-learn estimators
+    fitted with ``solver="sag"`` or ``solver="saga"``.
+
+    Parameters
+    ----------
+    X : array of shape (n_samples, n_features)
+        Training data; rows are indexed with ``X[idx]``.
+    y : array of shape (n_samples,)
+        Targets; ``y[idx]`` is passed to ``dloss``.
+    step_size : float
+        Step size applied to every update.
+    alpha : float
+        Factor of the ``alpha * weights`` term added to each update.
+    n_iter : int, default=1
+        Number of epochs; each epoch draws ``n_samples`` random indices.
+    dloss : callable
+        Called as ``dloss(p, y[idx])``; returns the loss derivative. Must be
+        provided: the default ``None`` is not callable.
+    sparse : bool, default=False
+        When True the intercept step is scaled by a decay of 0.01.
+    sample_weight : array of shape (n_samples,) or None, default=None
+        Optional per-sample factor applied to the loss derivative.
+    fit_intercept : bool, default=True
+        Whether to update the intercept; it stays at 0.0 otherwise.
+    saga : bool, default=False
+        Whether to apply the additional SAGA correction step.
+
+    Returns
+    -------
+    weights : ndarray of shape (n_features,)
+    intercept : float
+    """
+    n_samples, n_features = X.shape[0], X.shape[1]
+
+    weights = np.zeros(X.shape[1])
+    sum_gradient = np.zeros(X.shape[1])
+    # last update stored for every sample (one row per sample)
+    gradient_memory = np.zeros((n_samples, n_features))
+
+    intercept = 0.0
+    intercept_sum_gradient = 0.0
+    intercept_gradient_memory = np.zeros(n_samples)
+
+    # fixed seed: the sequence of sampled indices is reproducible
+    rng = np.random.RandomState(77)
+    decay = 1.0
+    # indices drawn so far; len(seen) normalizes the summed gradients
+    seen = set()
+
+    # sparse data has a fixed decay of .01
+    if sparse:
+        decay = 0.01
+
+    for epoch in range(n_iter):
+        for k in range(n_samples):
+            idx = int(rng.rand() * n_samples)
+            # idx = k
+            entry = X[idx]
+            seen.add(idx)
+            p = np.dot(entry, weights) + intercept
+            gradient = dloss(p, y[idx])
+            if sample_weight is not None:
+                gradient *= sample_weight[idx]
+            # new per-sample update, including the alpha * weights term
+            update = entry * gradient + alpha * weights
+            # replace the stored update of this sample inside the running sum
+            gradient_correction = update - gradient_memory[idx]
+            sum_gradient += gradient_correction
+            gradient_memory[idx] = update
+            if saga:
+                # extra SAGA step using only the change of this sample
+                weights -= gradient_correction * step_size * (1 - 1.0 / len(seen))
+
+            if fit_intercept:
+                # same bookkeeping for the intercept, using the raw derivative
+                gradient_correction = gradient - intercept_gradient_memory[idx]
+                intercept_gradient_memory[idx] = gradient
+                intercept_sum_gradient += gradient_correction
+                gradient_correction *= step_size * (1.0 - 1.0 / len(seen))
+                if saga:
+                    intercept -= (
+                        step_size * intercept_sum_gradient / len(seen) * decay
+                    ) + gradient_correction
+                else:
+                    intercept -= step_size * intercept_sum_gradient / len(seen) * decay
+
+            # averaged-gradient step over the samples seen so far
+            weights -= step_size * sum_gradient / len(seen)
+
+    return weights, intercept
+
+
+def sag_sparse(
+    X,
+    y,
+    step_size,
+    alpha,
+    n_iter=1,
+    dloss=None,
+    sample_weight=None,
+    sparse=False,
+    fit_intercept=True,
+    saga=False,
+    random_state=0,
+):
+    """Plain-Python SAG/SAGA solver using a weight scale and deferred updates.
+
+    The weights are stored divided by the scalar ``wscale`` (predictions use
+    ``wscale * np.dot(entry, weights)`` and the result is multiplied by
+    ``wscale`` before returning). The averaged-gradient steps are accumulated
+    in ``c_sum`` and applied to each feature at the start of the next
+    iteration. The tests in this file compare its output with scikit-learn
+    estimators fitted with ``solver="sag"`` or ``solver="saga"``.
+
+    Parameters
+    ----------
+    X : array of shape (n_samples, n_features)
+        Training data; rows are indexed with ``X[idx]``.
+    y : array of shape (n_samples,)
+        Targets; ``y[idx]`` is passed to ``dloss``.
+    step_size : float
+        Step size applied to every update.
+    alpha : float
+        Regularization factor; shrinks ``wscale`` by ``1 - alpha * step_size``
+        at every iteration.
+    n_iter : int, default=1
+        Number of epochs; each epoch draws ``n_samples`` random indices.
+    dloss : callable
+        Called as ``dloss(p, y[idx])``; returns the loss derivative. Must be
+        provided: the default ``None`` is not callable.
+    sample_weight : array of shape (n_samples,) or None, default=None
+        Optional per-sample factor applied to the loss derivative.
+    sparse : bool, default=False
+        When True the intercept step is scaled by a decay of 0.01.
+    fit_intercept : bool, default=True
+        Whether to update the intercept; it stays at 0.0 otherwise.
+    saga : bool, default=False
+        Whether to apply the additional SAGA correction step.
+    random_state : int, RandomState instance or None, default=0
+        Passed to ``check_random_state`` to draw the sample indices.
+
+    Returns
+    -------
+    weights : ndarray of shape (n_features,)
+    intercept : float
+
+    Raises
+    ------
+    ZeroDivisionError
+        If ``step_size * alpha == 1.0``: ``wscale`` would become zero and it
+        is used as a divisor.
+    """
+    if step_size * alpha == 1.0:
+        raise ZeroDivisionError(
+            "Sparse sag does not handle the case step_size * alpha == 1"
+        )
+    n_samples, n_features = X.shape[0], X.shape[1]
+
+    weights = np.zeros(n_features)
+    sum_gradient = np.zeros(n_features)
+    # value of ``counter`` at which each feature last received its
+    # deferred update
+    last_updated = np.zeros(n_features, dtype=int)
+    # last loss derivative (a scalar) stored for every sample
+    gradient_memory = np.zeros(n_samples)
+    rng = check_random_state(random_state)
+    intercept = 0.0
+    intercept_sum_gradient = 0.0
+    wscale = 1.0
+    decay = 1.0
+    # indices drawn so far; len(seen) normalizes the summed gradients
+    seen = set()
+
+    # running sum of step_size / (wscale * len(seen)), one entry per iteration
+    c_sum = np.zeros(n_iter * n_samples)
+
+    # sparse data has a fixed decay of .01
+    if sparse:
+        decay = 0.01
+
+    counter = 0
+    for epoch in range(n_iter):
+        for k in range(n_samples):
+            # idx = k
+            idx = int(rng.rand() * n_samples)
+            entry = X[idx]
+            seen.add(idx)
+
+            if counter >= 1:
+                # apply the steps accumulated since each feature was last
+                # updated
+                for j in range(n_features):
+                    if last_updated[j] == 0:
+                        weights[j] -= c_sum[counter - 1] * sum_gradient[j]
+                    else:
+                        weights[j] -= (
+                            c_sum[counter - 1] - c_sum[last_updated[j] - 1]
+                        ) * sum_gradient[j]
+                    last_updated[j] = counter
+
+            p = (wscale * np.dot(entry, weights)) + intercept
+            gradient = dloss(p, y[idx])
+
+            if sample_weight is not None:
+                gradient *= sample_weight[idx]
+
+            update = entry * gradient
+            # replace the stored contribution of this sample in the running sum
+            gradient_correction = update - (gradient_memory[idx] * entry)
+            sum_gradient += gradient_correction
+            if saga:
+                # extra SAGA step, divided by wscale because the weights are
+                # stored in scaled form
+                for j in range(n_features):
+                    weights[j] -= (
+                        gradient_correction[j]
+                        * step_size
+                        * (1 - 1.0 / len(seen))
+                        / wscale
+                    )
+
+            if fit_intercept:
+                # from here on ``gradient_correction`` is a scalar
+                gradient_correction = gradient - gradient_memory[idx]
+                intercept_sum_gradient += gradient_correction
+                gradient_correction *= step_size * (1.0 - 1.0 / len(seen))
+                if saga:
+                    intercept -= (
+                        step_size * intercept_sum_gradient / len(seen) * decay
+                    ) + gradient_correction
+                else:
+                    intercept -= step_size * intercept_sum_gradient / len(seen) * decay
+
+            gradient_memory[idx] = gradient
+
+            # the L2 shrinkage is folded into the scale instead of the weights
+            wscale *= 1.0 - alpha * step_size
+            if counter == 0:
+                c_sum[0] = step_size / (wscale * len(seen))
+            else:
+                c_sum[counter] = c_sum[counter - 1] + step_size / (wscale * len(seen))
+
+            if counter >= 1 and wscale < 1e-9:
+                # the scale became tiny: flush the deferred updates, fold the
+                # scale into the weights and restart the accumulation
+                for j in range(n_features):
+                    if last_updated[j] == 0:
+                        weights[j] -= c_sum[counter] * sum_gradient[j]
+                    else:
+                        weights[j] -= (
+                            c_sum[counter] - c_sum[last_updated[j] - 1]
+                        ) * sum_gradient[j]
+                    last_updated[j] = counter + 1
+                c_sum[counter] = 0
+                weights *= wscale
+                wscale = 1.0
+
+            counter += 1
+
+    # final flush of the deferred updates, then undo the scaling
+    for j in range(n_features):
+        if last_updated[j] == 0:
+            weights[j] -= c_sum[counter - 1] * sum_gradient[j]
+        else:
+            weights[j] -= (
+                c_sum[counter - 1] - c_sum[last_updated[j] - 1]
+            ) * sum_gradient[j]
+    weights *= wscale
+    return weights, intercept
+
+
+def get_step_size(X, alpha, fit_intercept, classification=True):
+    if classification:
+        return 4.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + 4.0 * alpha)
+    else:
+        return 1.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + alpha)
+
+
+def test_classifier_matching():
+    n_samples = 20
+    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1)
+    y[y == 0] = -1
+    alpha = 1.1
+    fit_intercept = True
+    step_size = get_step_size(X, alpha, fit_intercept)
+    for solver in ["sag", "saga"]:
+        if solver == "sag":
+            n_iter = 80
+        else:
+            # SAGA variance w.r.t. stream order is higher
+            n_iter = 300
+        clf = LogisticRegression(
+            solver=solver,
+            fit_intercept=fit_intercept,
+            tol=1e-11,
+            C=1.0 / alpha / n_samples,
+            max_iter=n_iter,
+            random_state=10,
+        )
+        clf.fit(X, y)
+
+        weights, intercept = sag_sparse(
+            X,
+            y,
+            step_size,
+            alpha,
+            n_iter=n_iter,
+            dloss=log_dloss,
+            fit_intercept=fit_intercept,
+            saga=solver == "saga",
+        )
+        weights2, intercept2 = sag(
+            X,
+            y,
+            step_size,
+            alpha,
+            n_iter=n_iter,
+            dloss=log_dloss,
+            fit_intercept=fit_intercept,
+            saga=solver == "saga",
+        )
+        weights = np.atleast_2d(weights)
+        intercept = np.atleast_1d(intercept)
+        weights2 = np.atleast_2d(weights2)
+        intercept2 = np.atleast_1d(intercept2)
+
+        assert_array_almost_equal(weights, clf.coef_, decimal=9)
+        assert_array_almost_equal(intercept, clf.intercept_, decimal=9)
+        assert_array_almost_equal(weights2, clf.coef_, decimal=9)
+        assert_array_almost_equal(intercept2, clf.intercept_, decimal=9)
+
+
+def test_regressor_matching():
+    n_samples = 10
+    n_features = 5
+
+    rng = np.random.RandomState(10)
+    X = rng.normal(size=(n_samples, n_features))
+    true_w = rng.normal(size=n_features)
+    y = X.dot(true_w)
+
+    alpha = 1.0
+    n_iter = 100
+    fit_intercept = True
+
+    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
+    clf = Ridge(
+        fit_intercept=fit_intercept,
+        tol=0.00000000001,
+        solver="sag",
+        alpha=alpha * n_samples,
+        max_iter=n_iter,
+    )
+    clf.fit(X, y)
+
+    weights1, intercept1 = sag_sparse(
+        X,
+        y,
+        step_size,
+        alpha,
+        n_iter=n_iter,
+        dloss=squared_dloss,
+        fit_intercept=fit_intercept,
+    )
+    weights2, intercept2 = sag(
+        X,
+        y,
+        step_size,
+        alpha,
+        n_iter=n_iter,
+        dloss=squared_dloss,
+        fit_intercept=fit_intercept,
+    )
+
+    assert_allclose(weights1, clf.coef_)
+    assert_allclose(intercept1, clf.intercept_)
+    assert_allclose(weights2, clf.coef_)
+    assert_allclose(intercept2, clf.intercept_)
+
+
+@pytest.mark.filterwarnings("ignore:The max_iter was reached")
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sag_pobj_matches_logistic_regression(csr_container):
+    """tests if the sag pobj matches log reg"""
+    n_samples = 100
+    alpha = 1.0
+    max_iter = 20
+    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1)
+
+    clf1 = LogisticRegression(
+        solver="sag",
+        fit_intercept=False,
+        tol=0.0000001,
+        C=1.0 / alpha / n_samples,
+        max_iter=max_iter,
+        random_state=10,
+    )
+    clf2 = clone(clf1)
+    clf3 = LogisticRegression(
+        fit_intercept=False,
+        tol=0.0000001,
+        C=1.0 / alpha / n_samples,
+        max_iter=max_iter,
+        random_state=10,
+    )
+
+    clf1.fit(X, y)
+    clf2.fit(csr_container(X), y)
+    clf3.fit(X, y)
+
+    pobj1 = get_pobj(clf1.coef_, alpha, X, y, log_loss)
+    pobj2 = get_pobj(clf2.coef_, alpha, X, y, log_loss)
+    pobj3 = get_pobj(clf3.coef_, alpha, X, y, log_loss)
+
+    assert_array_almost_equal(pobj1, pobj2, decimal=4)
+    assert_array_almost_equal(pobj2, pobj3, decimal=4)
+    assert_array_almost_equal(pobj3, pobj1, decimal=4)
+
+
+@pytest.mark.filterwarnings("ignore:The max_iter was reached")
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sag_pobj_matches_ridge_regression(csr_container):
+    """tests if the sag pobj matches ridge reg"""
+    n_samples = 100
+    n_features = 10
+    alpha = 1.0
+    n_iter = 100
+    fit_intercept = False
+    rng = np.random.RandomState(10)
+    X = rng.normal(size=(n_samples, n_features))
+    true_w = rng.normal(size=n_features)
+    y = X.dot(true_w)
+
+    clf1 = Ridge(
+        fit_intercept=fit_intercept,
+        tol=0.00000000001,
+        solver="sag",
+        alpha=alpha,
+        max_iter=n_iter,
+        random_state=42,
+    )
+    clf2 = clone(clf1)
+    clf3 = Ridge(
+        fit_intercept=fit_intercept,
+        tol=0.00001,
+        solver="lsqr",
+        alpha=alpha,
+        max_iter=n_iter,
+        random_state=42,
+    )
+
+    clf1.fit(X, y)
+    clf2.fit(csr_container(X), y)
+    clf3.fit(X, y)
+
+    pobj1 = get_pobj(clf1.coef_, alpha, X, y, squared_loss)
+    pobj2 = get_pobj(clf2.coef_, alpha, X, y, squared_loss)
+    pobj3 = get_pobj(clf3.coef_, alpha, X, y, squared_loss)
+
+    assert_array_almost_equal(pobj1, pobj2, decimal=4)
+    assert_array_almost_equal(pobj1, pobj3, decimal=4)
+    assert_array_almost_equal(pobj3, pobj2, decimal=4)
+
+
+@pytest.mark.filterwarnings("ignore:The max_iter was reached")
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sag_regressor_computed_correctly(csr_container):
+    """Check Ridge with ``solver="sag"`` against the ``sag_sparse`` reference.
+
+    Only the dense fit (``clf1``) is asserted. The CSR fit (``clf2``) and the
+    ``sparse=True`` reference run are computed, but their assertions are
+    commented out at the end of the test.
+    """
+    alpha = 0.1
+    n_features = 10
+    n_samples = 40
+    max_iter = 100
+    tol = 0.000001
+    fit_intercept = True
+    # NOTE: this single RandomState generates the data and is also passed as
+    # ``random_state`` to the estimators and the reference solver, so the
+    # order of the statements below matters.
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(n_samples, n_features))
+    w = rng.normal(size=n_features)
+    y = np.dot(X, w) + 2.0
+    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
+
+    clf1 = Ridge(
+        fit_intercept=fit_intercept,
+        tol=tol,
+        solver="sag",
+        alpha=alpha * n_samples,
+        max_iter=max_iter,
+        random_state=rng,
+    )
+    clf2 = clone(clf1)
+
+    clf1.fit(X, y)
+    clf2.fit(csr_container(X), y)
+
+    # reference run with the default decay
+    spweights1, spintercept1 = sag_sparse(
+        X,
+        y,
+        step_size,
+        alpha,
+        n_iter=max_iter,
+        dloss=squared_dloss,
+        fit_intercept=fit_intercept,
+        random_state=rng,
+    )
+
+    # reference run with ``sparse=True`` (intercept decay of 0.01)
+    spweights2, spintercept2 = sag_sparse(
+        X,
+        y,
+        step_size,
+        alpha,
+        n_iter=max_iter,
+        dloss=squared_dloss,
+        sparse=True,
+        fit_intercept=fit_intercept,
+        random_state=rng,
+    )
+
+    assert_array_almost_equal(clf1.coef_.ravel(), spweights1.ravel(), decimal=3)
+    assert_almost_equal(clf1.intercept_, spintercept1, decimal=1)
+
+    # TODO: uncomment when sparse Ridge with intercept will be fixed (#4710)
+    # assert_array_almost_equal(clf2.coef_.ravel(),
+    #                          spweights2.ravel(),
+    #                          decimal=3)
+    # assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
+
+
+def test_get_auto_step_size():
+    """Compare get_auto_step_size with hand-computed SAG and SAGA step sizes."""
+    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
+    alpha = 1.2
+    n_samples = X.shape[0]
+    # the second sample, [2, 3, 4], has the largest squared norm
+    max_squared_sum = 4 + 9 + 16
+    max_squared_sum_ = row_norms(X, squared=True).max()
+    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)
+
+    for is_saga in (True, False):
+        for fit_intercept in (True, False):
+            denom_sqr = max_squared_sum + alpha + int(fit_intercept)
+            denom_log = max_squared_sum + 4.0 * alpha + int(fit_intercept)
+            if is_saga:
+                L_sqr = denom_sqr
+                L_log = denom_log / 4.0
+                mun = 2 * n_samples * alpha
+                expected = {
+                    "squared": 1 / (2 * L_sqr + min(mun, L_sqr)),
+                    "log": 1 / (2 * L_log + min(mun, L_log)),
+                }
+            else:
+                expected = {"squared": 1.0 / denom_sqr, "log": 4.0 / denom_log}
+
+            # hand-computed value first, library value second
+            for loss, step_size in expected.items():
+                step_size_ = get_auto_step_size(
+                    max_squared_sum_,
+                    alpha,
+                    loss,
+                    fit_intercept,
+                    n_samples=n_samples,
+                    is_saga=is_saga,
+                )
+                assert_almost_equal(step_size, step_size_, decimal=4)
+
+    # an unknown loss name must be rejected
+    msg = "Unknown loss function for SAG solver, got wrong instead of"
+    with pytest.raises(ValueError, match=msg):
+        get_auto_step_size(max_squared_sum_, alpha, "wrong", False)
+
+
+@pytest.mark.parametrize("seed", range(3))  # locally tested with 1000 seeds
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sag_regressor(seed, csr_container):
+    """Ridge with the sag solver must reach a minimum score on a 1-D linear target."""
+    lower, upper = -5, 5
+    n_samples = 300
+    ridge_kwargs = dict(
+        tol=0.001,
+        solver="sag",
+        max_iter=100,
+        alpha=0.1 * n_samples,
+    )
+    rng = np.random.RandomState(seed)
+    X = np.linspace(lower, upper, n_samples).reshape(n_samples, 1)
+
+    # noise-free target y = 0.5 * x; the estimators share the seeded rng
+    y = 0.5 * X.ravel()
+    dense_reg = Ridge(random_state=rng, **ridge_kwargs)
+    sparse_reg = clone(dense_reg)
+    dense_reg.fit(X, y)
+    sparse_reg.fit(csr_container(X), y)
+    dense_score = dense_reg.score(X, y)
+    sparse_score = sparse_reg.score(X, y)
+    assert dense_score > 0.98
+    assert sparse_score > 0.98
+
+    # same target plus noise drawn from rng; no random_state on these estimators
+    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
+    dense_reg = Ridge(**ridge_kwargs)
+    sparse_reg = clone(dense_reg)
+    dense_reg.fit(X, y)
+    sparse_reg.fit(csr_container(X), y)
+    dense_score = dense_reg.score(X, y)
+    sparse_score = sparse_reg.score(X, y)
+    assert dense_score > 0.45
+    assert sparse_score > 0.45
+
+
+@pytest.mark.filterwarnings("ignore:The max_iter was reached")
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sag_classifier_computed_correctly(csr_container):
+    """Binary LogisticRegression(solver="sag") must match sag_sparse's output."""
+    alpha = 0.1
+    n_samples = 50
+    n_iter = 50
+    fit_intercept = True
+    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1)
+    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
+    # recode the labels: +1 for the second class, -1 for everything else
+    y = np.where(y != np.unique(y)[1], -1.0, 1.0)
+
+    dense_clf = LogisticRegression(
+        solver="sag",
+        C=1.0 / alpha / n_samples,
+        max_iter=n_iter,
+        tol=0.00001,
+        random_state=77,
+        fit_intercept=fit_intercept,
+    )
+    sparse_clf = clone(dense_clf)
+    dense_clf.fit(X, y)
+    sparse_clf.fit(csr_container(X), y)
+
+    # reference runs: first the default mode, then sparse=True
+    ref_kwargs = dict(n_iter=n_iter, dloss=log_dloss, fit_intercept=fit_intercept)
+    dense_ref = sag_sparse(X, y, step_size, alpha, **ref_kwargs)
+    sparse_ref = sag_sparse(X, y, step_size, alpha, sparse=True, **ref_kwargs)
+
+    for fitted, (ref_coef, ref_intercept) in (
+        (dense_clf, dense_ref),
+        (sparse_clf, sparse_ref),
+    ):
+        assert_array_almost_equal(fitted.coef_.ravel(), ref_coef.ravel(), decimal=2)
+        assert_almost_equal(fitted.intercept_, ref_intercept, decimal=1)
+
+
+@pytest.mark.filterwarnings("ignore:The max_iter was reached")
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sag_multiclass_computed_correctly(csr_container):
+    """One-vs-rest sag LogisticRegression must match per-class sag_sparse runs."""
+    alpha = 0.1
+    n_samples = 20
+    max_iter = 70
+    fit_intercept = True
+    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1)
+    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
+    classes = np.unique(y)
+
+    dense_ovr = OneVsRestClassifier(
+        LogisticRegression(
+            solver="sag",
+            C=1.0 / alpha / n_samples,
+            max_iter=max_iter,
+            tol=1e-5,
+            random_state=77,
+            fit_intercept=fit_intercept,
+        )
+    )
+    sparse_ovr = clone(dense_ovr)
+    dense_ovr.fit(X, y)
+    sparse_ovr.fit(csr_container(X), y)
+
+    # one reference problem per class: that class is +1, all others are -1
+    ref_kwargs = dict(dloss=log_dloss, n_iter=max_iter, fit_intercept=fit_intercept)
+    dense_refs, sparse_refs = [], []
+    for cl in classes:
+        y_encoded = np.where(y != cl, -1.0, 1.0)
+        dense_refs.append(sag_sparse(X, y_encoded, step_size, alpha, **ref_kwargs))
+        sparse_refs.append(
+            sag_sparse(X, y_encoded, step_size, alpha, sparse=True, **ref_kwargs)
+        )
+
+    coef1 = np.vstack([weights for weights, _ in dense_refs])
+    intercept1 = np.array([bias for _, bias in dense_refs])
+    coef2 = np.vstack([weights for weights, _ in sparse_refs])
+    intercept2 = np.array([bias for _, bias in sparse_refs])
+
+    for i in range(len(classes)):
+        dense_est = dense_ovr.estimators_[i]
+        sparse_est = sparse_ovr.estimators_[i]
+
+        assert_allclose(dense_est.coef_.ravel(), coef1[i], rtol=1e-2)
+        assert_allclose(dense_est.intercept_, intercept1[i], rtol=1e-1)
+
+        assert_allclose(sparse_est.coef_.ravel(), coef2[i], rtol=1e-2)
+        # Note the very crude accuracy, i.e. high rtol.
+        assert_allclose(sparse_est.intercept_, intercept2[i], rtol=5e-1)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_classifier_results(csr_container):
+    """sag LogisticRegression must reproduce the training labels on this toy set."""
+    alpha = 0.1
+    n_features = 20
+    n_samples = 10
+    rng = np.random.RandomState(0)
+    X = rng.normal(size=(n_samples, n_features))
+    w = rng.normal(size=n_features)
+    # labels are the signs of a random linear function of X
+    y = np.sign(np.dot(X, w))
+
+    dense_clf = LogisticRegression(
+        solver="sag",
+        C=1.0 / alpha / n_samples,
+        max_iter=200,
+        tol=0.01,
+        random_state=77,
+    )
+    sparse_clf = clone(dense_clf)
+    dense_clf.fit(X, y)
+    sparse_clf.fit(csr_container(X), y)
+
+    dense_pred = dense_clf.predict(X)
+    sparse_pred = sparse_clf.predict(X)
+    for predicted in (dense_pred, sparse_pred):
+        assert_almost_equal(predicted, y, decimal=12)
+
+
+@pytest.mark.filterwarnings("ignore:The max_iter was reached")
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_binary_classifier_class_weight(csr_container):
+    """Binary sag classifier with per-class weights must match weighted sag_sparse."""
+    alpha = 0.1
+    n_samples = 50
+    n_iter = 20
+    fit_intercept = True
+    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10, cluster_std=0.1)
+    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
+    # recode the labels: +1 for the second class, -1 for everything else
+    y = np.where(y != np.unique(y)[1], -1.0, 1.0)
+
+    class_weight = {1: 0.45, -1: 0.55}
+    dense_clf = LogisticRegression(
+        solver="sag",
+        C=1.0 / alpha / n_samples,
+        max_iter=n_iter,
+        tol=0.00001,
+        random_state=77,
+        fit_intercept=fit_intercept,
+        class_weight=class_weight,
+    )
+    sparse_clf = clone(dense_clf)
+    dense_clf.fit(X, y)
+    sparse_clf.fit(csr_container(X), y)
+
+    # turn the class weights into one weight per sample for the reference runs
+    class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y)
+    sample_weight = class_weight_[LabelEncoder().fit_transform(y)]
+    ref_kwargs = dict(
+        n_iter=n_iter,
+        dloss=log_dloss,
+        sample_weight=sample_weight,
+        fit_intercept=fit_intercept,
+    )
+    dense_ref = sag_sparse(X, y, step_size, alpha, **ref_kwargs)
+    sparse_ref = sag_sparse(X, y, step_size, alpha, sparse=True, **ref_kwargs)
+
+    for fitted, (ref_coef, ref_intercept) in (
+        (dense_clf, dense_ref),
+        (sparse_clf, sparse_ref),
+    ):
+        assert_array_almost_equal(fitted.coef_.ravel(), ref_coef.ravel(), decimal=2)
+        assert_almost_equal(fitted.intercept_, ref_intercept, decimal=1)
+
+
+def test_classifier_single_class():
+    """Fitting the sag solver on single-class data must raise ValueError."""
+    samples = [[1, 2], [3, 4]]
+    labels = [1, 1]
+    clf = LogisticRegression(solver="sag")
+
+    expected = "This solver needs samples of at least 2 classes in the data"
+    with pytest.raises(ValueError, match=expected):
+        clf.fit(samples, labels)
+
+
+def test_step_size_alpha_error():
+    """Both sag estimators must raise ZeroDivisionError on this all-zero X."""
+    X = [[0, 0], [0, 0]]
+    y = [1, -1]
+    fit_intercept = False
+    alpha = 1.0
+    msg = re.escape(
+        "Current sag implementation does not handle the case"
+        " step_size * alpha_scaled == 1"
+    )
+
+    estimators = (
+        LogisticRegression(solver="sag", C=1.0 / alpha, fit_intercept=fit_intercept),
+        Ridge(fit_intercept=fit_intercept, solver="sag", alpha=alpha),
+    )
+    for estimator in estimators:
+        with pytest.raises(ZeroDivisionError, match=msg):
+            estimator.fit(X, y)
+
+
+def test_multinomial_loss():
+    """Multinomial SAG loss and gradient must agree with LinearModelLoss."""
+    X, y = iris.data, iris.target.astype(np.float64)
+    n_samples, n_features = X.shape
+    n_classes = len(np.unique(y))
+
+    rng = check_random_state(42)
+    weights = rng.randn(n_features, n_classes)
+    intercept = rng.randn(n_classes)
+    sample_weights = np.abs(rng.randn(n_samples))
+    total_weight = np.sum(sample_weights)
+
+    # loss and gradient as computed for multinomial SAG
+    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
+    sag_loss, sag_grad = _multinomial_grad_loss_all_samples(
+        dataset, weights, intercept, n_samples, n_features, n_classes
+    )
+
+    # loss and gradient as computed for multinomial LogisticRegression
+    linear_loss = LinearModelLoss(
+        base_loss=HalfMultinomialLoss(n_classes=n_classes),
+        fit_intercept=True,
+    )
+    ref_loss, ref_grad = linear_loss.loss_gradient(
+        np.vstack((weights, intercept)).T,
+        X,
+        y,
+        l2_reg_strength=0.0,
+        sample_weight=sample_weights,
+    )
+    # drop the intercept column, then convert to the same convention, i.e.
+    # LinearModelLoss uses average(loss, weight=sw)
+    ref_grad = ref_grad[:, :-1].T * total_weight
+    ref_loss = ref_loss * total_weight
+
+    assert_array_almost_equal(sag_grad, ref_grad)
+    assert_almost_equal(sag_loss, ref_loss)
+
+
+def test_multinomial_loss_ground_truth():
+    """Check the multinomial loss and gradient against hard-coded values."""
+    # n_samples, n_features, n_classes = 4, 2, 3
+    n_classes = 3
+    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
+    y = np.array([0, 1, 2, 0], dtype=np.float64)
+    Y_bin = LabelBinarizer().fit_transform(y)
+
+    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
+    intercept = np.array([1.0, 0, -0.2])
+    sample_weights = np.array([0.8, 1, 1, 0.8])
+    sw_column = sample_weights[:, np.newaxis]
+    total_weight = np.sum(sample_weights)
+
+    # direct numpy computation: p is the log of the softmax of the raw scores
+    scores = np.dot(X, weights) + intercept
+    p = scores - logsumexp(scores, axis=1)[:, np.newaxis]
+    manual_loss = -(sw_column * p * Y_bin).sum()
+    manual_grad = np.dot(X.T, sw_column * (np.exp(p) - Y_bin))
+
+    linear_loss = LinearModelLoss(
+        base_loss=HalfMultinomialLoss(n_classes=n_classes),
+        fit_intercept=True,
+    )
+    ref_loss, ref_grad = linear_loss.loss_gradient(
+        np.vstack((weights, intercept)).T,
+        X,
+        y,
+        l2_reg_strength=0.0,
+        sample_weight=sample_weights,
+    )
+    # drop the intercept column, then convert to the same convention, i.e.
+    # LinearModelLoss uses average(loss, weight=sw)
+    ref_grad = ref_grad[:, :-1].T * total_weight
+    ref_loss = ref_loss * total_weight
+
+    assert_almost_equal(manual_loss, ref_loss)
+    assert_array_almost_equal(manual_grad, ref_grad)
+
+    # ground truth
+    loss_gt = 11.680360354325961
+    grad_gt = np.array(
+        [[-0.557487, -1.619151, +2.176638], [-0.903942, +5.258745, -4.354803]]
+    )
+    assert_almost_equal(manual_loss, loss_gt)
+    assert_array_almost_equal(manual_grad, grad_gt)
+
+
+@pytest.mark.parametrize("solver", ["sag", "saga"])
+def test_sag_classifier_raises_error(solver):
+    """Non-regression test: numerical errors in cython sag must surface.
+
+    Following #13316, the error handling behavior changed in cython sag; this
+    test makes sure numerical errors are still properly raised.
+    """
+    rng = np.random.RandomState(42)
+    X, y = make_classification(random_state=rng)
+
+    # first fit on a simple problem, keeping the state for a warm restart
+    model = LogisticRegression(solver=solver, random_state=rng, warm_start=True)
+    model.fit(X, y)
+
+    # corrupt the fitted coefficients; thanks to warm_start the next fit
+    # starts from this NaN state, which triggers the numerical error
+    model.coef_[:] = np.nan
+
+    with pytest.raises(ValueError, match="Floating-point under-/overflow"):
+        model.fit(X, y)
@@ -0,0 +1,384 @@
+import numpy as np
+import pytest
+import scipy.sparse as sp
+from numpy.testing import assert_allclose
+
+from sklearn.datasets import make_regression
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    create_memmap_backed_data,
+    ignore_warnings,
+)
+from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, LIL_CONTAINERS
+
+
+def test_sparse_coef():
+    """sparse_coef_ must be sparse and hold the same values as coef_."""
+    enet = ElasticNet()
+    enet.coef_ = [1, 2, 3]
+    sparse_view = enet.sparse_coef_
+
+    assert sp.issparse(sparse_view)
+    first_row = sparse_view.toarray().tolist()[0]
+    assert first_row == enet.coef_
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_lasso_zero(csc_container):
+    """Sparse Lasso must cope with all-zero data without crashing."""
+    empty_X = csc_container((3, 1))
+    zero_y = [0, 0, 0]
+    queries = np.array([[1], [2], [3]])
+
+    lasso = Lasso().fit(empty_X, zero_y)
+
+    assert_array_almost_equal(lasso.coef_, [0])
+    assert_array_almost_equal(lasso.predict(queries), [0, 0, 0])
+    assert_almost_equal(lasso.dual_gap_, 0)
+
+
+@pytest.mark.parametrize("with_sample_weight", [True, False])
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_enet_toy_list_input(with_sample_weight, csc_container):
+    """ElasticNet on a toy line for several alpha / l1_ratio values (list target)."""
+    X = csc_container(np.array([[-1], [0], [1]]))
+    Y = [-1, 0, 1]  # just a straight line
+    T = np.array([[2], [3], [4]])  # test sample
+    sw = np.array([2.0, 2, 2]) if with_sample_weight else None
+
+    # alpha=0 should be the same as unregularized least squares; it is
+    # discouraged but should work, so the warning about alpha=0 is silenced
+    unregularized = ElasticNet(alpha=0, l1_ratio=1.0)
+    ignore_warnings(unregularized.fit)(X, Y, sample_weight=sw)
+    assert_array_almost_equal(unregularized.coef_, [1])
+    assert_array_almost_equal(unregularized.predict(T), [2, 3, 4])
+    assert_almost_equal(unregularized.dual_gap_, 0)
+
+    # (l1_ratio, expected coefficient, expected predictions) at alpha=0.5
+    regularized_cases = [
+        (0.3, [0.50819], [1.0163, 1.5245, 2.0327]),
+        (0.5, [0.45454], [0.9090, 1.3636, 1.8181]),
+    ]
+    for l1_ratio, want_coef, want_pred in regularized_cases:
+        enet = ElasticNet(alpha=0.5, l1_ratio=l1_ratio)
+        enet.fit(X, Y, sample_weight=sw)
+        assert_array_almost_equal(enet.coef_, want_coef, decimal=3)
+        assert_array_almost_equal(enet.predict(T), want_pred, decimal=3)
+        assert_almost_equal(enet.dual_gap_, 0)
+
+
+@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
+def test_enet_toy_explicit_sparse_input(lil_container):
+    """ElasticNet on a toy line for several alpha / l1_ratio values (sparse X)."""
+    # training samples; entry [1, 0] is left at its implicit zero
+    X = lil_container((3, 1))
+    X[0, 0] = -1
+    X[2, 0] = 1
+    Y = [-1, 0, 1]  # just a straight line (the identity function)
+
+    # test samples
+    T = lil_container((3, 1))
+    for row, value in enumerate((2, 3, 4)):
+        T[row, 0] = value
+
+    # this should be the same as lasso
+    unregularized = ElasticNet(alpha=0, l1_ratio=1.0)
+    ignore_warnings(unregularized.fit)(X, Y)
+    assert_array_almost_equal(unregularized.coef_, [1])
+    assert_array_almost_equal(unregularized.predict(T), [2, 3, 4])
+    assert_almost_equal(unregularized.dual_gap_, 0)
+
+    # (l1_ratio, expected coefficient, expected predictions) at alpha=0.5
+    regularized_cases = [
+        (0.3, [0.50819], [1.0163, 1.5245, 2.0327]),
+        (0.5, [0.45454], [0.9090, 1.3636, 1.8181]),
+    ]
+    for l1_ratio, want_coef, want_pred in regularized_cases:
+        enet = ElasticNet(alpha=0.5, l1_ratio=l1_ratio)
+        enet.fit(X, Y)
+        assert_array_almost_equal(enet.coef_, want_coef, decimal=3)
+        assert_array_almost_equal(enet.predict(T), want_pred, decimal=3)
+        assert_almost_equal(enet.dual_gap_, 0)
+
+
+def make_sparse_data(
+    sparse_container,
+    n_samples=100,
+    n_features=100,
+    n_informative=10,
+    seed=42,
+    positive=False,
+    n_targets=1,
+):
+    """Build an ill-posed sparse regression problem.
+
+    The problem has many noisy features and comparatively few samples.
+    Returns ``(X, y)`` where ``X`` has been passed through
+    ``sparse_container`` and ``y`` is flattened when ``n_targets == 1``.
+    """
+    rng = np.random.RandomState(seed)
+
+    # ground truth model: only the first n_informative rows are non-zero
+    coef = rng.randn(n_features, n_targets)
+    coef[n_informative:] = 0.0
+    if positive:
+        coef = np.abs(coef)
+
+    # input signal with 50% of the entries zeroed out
+    dense_X = rng.randn(n_samples, n_features)
+    mask = rng.uniform(size=(n_samples, n_features)) > 0.5
+    dense_X[mask] = 0.0
+
+    # training labels come from the dense signal, before the conversion
+    y = np.dot(dense_X, coef)
+    X = sparse_container(dense_X)
+    return X, (np.ravel(y) if n_targets == 1 else y)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+@pytest.mark.parametrize(
+    "alpha, fit_intercept, positive",
+    [(0.1, False, False), (0.1, True, False), (1e-3, False, True), (1e-3, True, True)],
+)
+def test_sparse_enet_not_as_toy_dataset(csc_container, alpha, fit_intercept, positive):
+    """Sparse and dense ElasticNet fits must converge to the same sparse model."""
+    n_samples, n_features, max_iter = 100, 100, 1000
+    n_informative = 10
+    half = n_samples // 2
+    enet_params = dict(
+        alpha=alpha,
+        l1_ratio=0.8,
+        fit_intercept=fit_intercept,
+        max_iter=max_iter,
+        tol=1e-7,
+        positive=positive,
+        warm_start=True,
+    )
+
+    X, y = make_sparse_data(
+        csc_container, n_samples, n_features, n_informative, positive=positive
+    )
+    # second half is used for training, first half for testing
+    X_train, X_test = X[half:], X[:half]
+    y_train, y_test = y[half:], y[:half]
+
+    sparse_fit = ElasticNet(**enet_params)
+    sparse_fit.fit(X_train, y_train)
+    assert_almost_equal(sparse_fit.dual_gap_, 0, 4)
+    assert sparse_fit.score(X_test, y_test) > 0.85
+
+    # check the convergence is the same as the dense version
+    dense_fit = ElasticNet(**enet_params)
+    dense_fit.fit(X_train.toarray(), y_train)
+    assert_almost_equal(dense_fit.dual_gap_, 0, 4)
+    assert dense_fit.score(X_test, y_test) > 0.85
+
+    assert_almost_equal(sparse_fit.coef_, dense_fit.coef_, 5)
+    assert_almost_equal(sparse_fit.intercept_, dense_fit.intercept_, 5)
+
+    # check that the coefs are sparse
+    n_nonzero = np.sum(sparse_fit.coef_ != 0.0)
+    assert n_nonzero < 2 * n_informative
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_sparse_lasso_not_as_toy_dataset(csc_container):
+    """Sparse and dense Lasso fits must both converge; the sparse fit stays sparse."""
+    n_samples = 100
+    n_informative = 10
+    half = n_samples // 2
+    lasso_params = dict(alpha=0.1, fit_intercept=False, max_iter=1000, tol=1e-7)
+
+    X, y = make_sparse_data(
+        csc_container, n_samples=n_samples, n_informative=n_informative
+    )
+    # second half is used for training, first half for testing
+    X_train, X_test = X[half:], X[:half]
+    y_train, y_test = y[half:], y[:half]
+
+    sparse_fit = Lasso(**lasso_params)
+    sparse_fit.fit(X_train, y_train)
+    assert_almost_equal(sparse_fit.dual_gap_, 0, 4)
+    assert sparse_fit.score(X_test, y_test) > 0.85
+
+    # check the convergence is the same as the dense version
+    dense_fit = Lasso(**lasso_params)
+    dense_fit.fit(X_train.toarray(), y_train)
+    assert_almost_equal(dense_fit.dual_gap_, 0, 4)
+    assert dense_fit.score(X_test, y_test) > 0.85
+
+    # check that the coefs are sparse
+    n_nonzero = np.sum(sparse_fit.coef_ != 0.0)
+    assert n_nonzero == n_informative
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_enet_multitarget(csc_container):
+    """A multi-target ElasticNet fit must equal fitting each target on its own."""
+    n_targets = 3
+    X, y = make_sparse_data(csc_container, n_targets=n_targets)
+
+    # XXX: There is a bug when precompute is not False!
+    estimator = ElasticNet(alpha=0.01, precompute=False)
+    estimator.fit(X, y)
+    # keep the joint results before the estimator is refitted below
+    joint_coef = estimator.coef_
+    joint_intercept = estimator.intercept_
+    joint_dual_gap = estimator.dual_gap_
+
+    for target in range(n_targets):
+        estimator.fit(X, y[:, target])
+        assert_array_almost_equal(joint_coef[target, :], estimator.coef_)
+        assert_array_almost_equal(joint_intercept[target], estimator.intercept_)
+        assert_array_almost_equal(joint_dual_gap[target], estimator.dual_gap_)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_path_parameters(csc_container):
+    """ElasticNetCV must keep its path settings and give equal sparse/dense paths."""
+    X, y = make_sparse_data(csc_container)
+    n_alphas = 10
+    enet_cv = ElasticNetCV(
+        n_alphas=n_alphas,
+        eps=1e-3,
+        max_iter=50,
+        l1_ratio=0.5,
+        fit_intercept=False,
+    )
+    quiet_fit = ignore_warnings(enet_cv.fit)
+
+    quiet_fit(X, y)  # new params
+    assert_almost_equal(0.5, enet_cv.l1_ratio)
+    assert n_alphas == enet_cv.n_alphas
+    assert n_alphas == len(enet_cv.alphas_)
+    sparse_mse_path = enet_cv.mse_path_
+
+    quiet_fit(X.toarray(), y)  # compare with dense data
+    assert_almost_equal(enet_cv.mse_path_, sparse_mse_path)
+
+
+@pytest.mark.parametrize("Model", [Lasso, ElasticNet, LassoCV, ElasticNetCV])
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("n_samples, n_features", [(24, 6), (6, 24)])
+@pytest.mark.parametrize("with_sample_weight", [True, False])
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_sparse_dense_equality(
+    Model, fit_intercept, n_samples, n_features, with_sample_weight, csc_container
+):
+    """Fitting on sparse X must give the same model as fitting on dense X."""
+    X, y = make_regression(
+        n_samples=n_samples,
+        n_features=n_features,
+        effective_rank=n_features // 2,
+        n_informative=n_features // 2,
+        bias=4 * fit_intercept,
+        noise=1,
+        random_state=42,
+    )
+    sw = None
+    if with_sample_weight:
+        sw = np.abs(np.random.RandomState(42).normal(scale=10, size=y.shape))
+
+    reg_dense = Model(fit_intercept=fit_intercept).fit(X, y, sample_weight=sw)
+    reg_sparse = Model(fit_intercept=fit_intercept).fit(
+        csc_container(X), y, sample_weight=sw
+    )
+
+    if fit_intercept:
+        assert reg_sparse.intercept_ == pytest.approx(reg_dense.intercept_)
+        # balance property: weighted mean prediction equals weighted mean target
+        mean_prediction = np.average(reg_sparse.predict(X), weights=sw)
+        assert mean_prediction == pytest.approx(np.average(y, weights=sw))
+    assert_allclose(reg_sparse.coef_, reg_dense.coef_)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_same_output_sparse_dense_lasso_and_enet_cv(csc_container):
+    """ElasticNetCV and LassoCV must give equal results on sparse and dense X."""
+    X, y = make_sparse_data(csc_container, n_samples=40, n_features=10)
+
+    # ElasticNetCV with its default cv, then LassoCV with cv=4
+    for cv_model, extra in ((ElasticNetCV, {}), (LassoCV, {"cv": 4})):
+        sparse_cv = cv_model(max_iter=100, **extra)
+        sparse_cv.fit(X, y)
+        dense_cv = cv_model(max_iter=100, **extra)
+        dense_cv.fit(X.toarray(), y)
+
+        assert_almost_equal(sparse_cv.alpha_, dense_cv.alpha_, 7)
+        assert_almost_equal(sparse_cv.intercept_, dense_cv.intercept_, 7)
+        assert_array_almost_equal(sparse_cv.mse_path_, dense_cv.mse_path_)
+        assert_array_almost_equal(sparse_cv.alphas_, dense_cv.alphas_)
+
+
+@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
+def test_same_multiple_output_sparse_dense(coo_container):
+    """Multi-output ElasticNet must predict the same for dense and sparse input."""
+    X = [
+        [0, 1, 2, 3, 4],
+        [0, 2, 5, 8, 11],
+        [9, 10, 11, 12, 13],
+        [10, 11, 12, 13, 14],
+    ]
+    y = [
+        [1, 2, 3, 4, 5],
+        [1, 3, 6, 9, 12],
+        [10, 11, 12, 13, 14],
+        [11, 12, 13, 14, 15],
+    ]
+    sample = np.array([1, 2, 3, 4, 5]).reshape(1, -1)
+
+    # dense fit and prediction
+    dense_model = ElasticNet()
+    dense_model.fit(X, y)
+    predict_dense = dense_model.predict(sample)
+
+    # sparse fit and prediction on the same data
+    sparse_model = ElasticNet()
+    sparse_model.fit(coo_container(X), y)
+    predict_sparse = sparse_model.predict(coo_container(sample))
+
+    assert_array_almost_equal(predict_sparse, predict_dense)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_sparse_enet_coordinate_descent(csc_container):
+    """A ConvergenceWarning must be issued when the model does not converge."""
+    n_samples, n_features = 5, 2
+    X = csc_container((n_samples, n_features)) * 1e50
+    y = np.ones(n_samples)
+    lasso = Lasso(max_iter=2)
+
+    warning_message = (
+        "Objective did not converge. You might want "
+        "to increase the number of iterations."
+    )
+    with pytest.warns(ConvergenceWarning, match=warning_message):
+        lasso.fit(X, y)
+
+
+@pytest.mark.parametrize("copy_X", (True, False))
+def test_sparse_read_only_buffer(copy_X):
+    """Sparse coordinate descent must accept an X whose data buffer is read-only."""
+    rng = np.random.RandomState(0)
+    enet = ElasticNet(alpha=0.1, copy_X=copy_X, random_state=rng)
+
+    sparse_X = sp.random(100, 20, format="csc", random_state=rng)
+    # Make the data array read-only by swapping in a memmap-backed copy
+    sparse_X.data = create_memmap_backed_data(sparse_X.data)
+    target = rng.rand(100)
+
+    enet.fit(sparse_X, target)
@@ -0,0 +1,294 @@
+"""
+Testing for Theil-Sen module (sklearn.linear_model._theil_sen)
+"""
+
+# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
+# License: BSD 3 clause
+import os
+import re
+import sys
+from contextlib import contextmanager
+
+import numpy as np
+import pytest
+from numpy.testing import (
+    assert_array_almost_equal,
+    assert_array_equal,
+    assert_array_less,
+)
+from scipy.linalg import norm
+from scipy.optimize import fmin_bfgs
+
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.linear_model import LinearRegression, TheilSenRegressor
+from sklearn.linear_model._theil_sen import (
+    _breakdown_point,
+    _modified_weiszfeld_step,
+    _spatial_median,
+)
+from sklearn.utils._testing import assert_almost_equal
+
+
+@contextmanager
+def no_stdout_stderr():
+    """Temporarily point ``sys.stdout`` and ``sys.stderr`` at ``os.devnull``.
+
+    The stream objects that were installed on entry are put back when the
+    ``with`` block exits, including when its body raises, so a failing test
+    cannot leave the process-wide streams bound to a closed file.
+    """
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr
+    with open(os.devnull, "w") as devnull:
+        sys.stdout = devnull
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            # Flush while the file is still open, then restore unconditionally.
+            devnull.flush()
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+
+
+def gen_toy_problem_1d(intercept=True):
+    """Build a seeded 1-D linear toy problem with hand-placed outliers.
+
+    Returns ``(X, y, w, c)``: ``X`` is the sample column of shape
+    ``(n_samples, 1)``, ``y`` the noisy targets, ``w`` the slope (3.0) and
+    ``c`` the offset (2.0 with 50 samples when ``intercept`` is true, else
+    0.1 with 100 samples).
+    """
+    rng = np.random.RandomState(0)
+    # Linear model y = w * x + c + noise, noise scaled by 0.1
+    w = 3.0
+    c, n_samples = (2.0, 50) if intercept else (0.1, 100)
+    x = rng.normal(size=n_samples)
+    y = w * x + c + 0.1 * rng.normal(size=n_samples)
+
+    # Outliers as index -> (x, y), written in place below.
+    if intercept:
+        outliers = {42: (-2, 4), 43: (-2.5, 8), 33: (2.5, 1), 49: (2.1, 2)}
+    else:
+        outliers = {
+            42: (-2, 4),
+            43: (-2.5, 8),
+            53: (2.5, 1),
+            60: (2.1, 2),
+            72: (1.8, -7),
+        }
+    for idx, (x_out, y_out) in outliers.items():
+        x[idx], y[idx] = x_out, y_out
+    return x[:, np.newaxis], y, w, c
+
+
+def gen_toy_problem_2d():
+    """Build a seeded 2-feature linear toy problem with corrupted targets.
+
+    Returns ``(X, y, w, c)`` with 100 samples, ``w = [5, 10]`` and ``c = 1``.
+    """
+    rng = np.random.RandomState(0)
+    n_samples = 100
+    w = np.array([5.0, 10.0])
+    c = 1.0
+    # Linear model y = 5*x_1 + 10*x_2 + c + noise, noise scaled by 0.1
+    X = rng.normal(size=(n_samples, 2))
+    y = np.dot(X, w) + c + 0.1 * rng.normal(size=n_samples)
+    # Overwrite targets at n_samples // 10 indices drawn with randint
+    # (so an index may repeat) with values scaled by 50.
+    n_outliers = n_samples // 10
+    outlier_idx = rng.randint(0, n_samples, size=n_outliers)
+    y[outlier_idx] = 50 * rng.normal(size=n_outliers)
+    return X, y, w, c
+
+
+def gen_toy_problem_4d():
+    """Build a seeded 4-feature linear toy problem with corrupted targets.
+
+    Returns ``(X, y, w, c)`` with 10000 samples, ``w = [5, 10, 42, 7]`` and
+    ``c = 1``.
+    """
+    rng = np.random.RandomState(0)
+    n_samples = 10000
+    w = np.array([5.0, 10.0, 42.0, 7.0])
+    c = 1.0
+    # Linear model y = 5*x_1 + 10*x_2 + 42*x_3 + 7*x_4 + c + noise,
+    # noise scaled by 0.1
+    X = rng.normal(size=(n_samples, 4))
+    y = np.dot(X, w) + c + 0.1 * rng.normal(size=n_samples)
+    # Overwrite targets at n_samples // 10 indices drawn with randint
+    # (so an index may repeat) with values scaled by 50.
+    n_outliers = n_samples // 10
+    outlier_idx = rng.randint(0, n_samples, size=n_outliers)
+    y[outlier_idx] = 50 * rng.normal(size=n_outliers)
+    return X, y, w, c
+
+
+def test_modweiszfeld_step_1d():
+    """Exercise ``_modified_weiszfeld_step`` on three 1-D points and one row."""
+    X = np.array([[1.0], [2.0], [3.0]])
+    median = 2.0
+
+    # Starting at the median (itself a sample) must return the median.
+    assert_array_almost_equal(_modified_weiszfeld_step(X, median), median)
+
+    # Starting away from the solution, first off-sample (2.5) and then on a
+    # sample (3.0): the step must land strictly between median and start.
+    for start in (2.5, 3.0):
+        stepped = _modified_weiszfeld_step(X, start)
+        assert_array_less(median, stepped)
+        assert_array_less(stepped, start)
+
+    # With a single row, stepping from that row must return it unchanged.
+    X = np.array([[1.0, 2.0, 3.0]])
+    only_row = X[0]
+    assert_array_equal(only_row, _modified_weiszfeld_step(X, only_row))
+
+
+def test_modweiszfeld_step_2d():
+    """Check two successive steps and a fixed point for three 2-D points."""
+    X = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+
+    # First two iterations starting from (0.5, 0.5).
+    first = _modified_weiszfeld_step(X, np.array([0.5, 0.5]))
+    assert_array_almost_equal(first, np.array([1 / 3, 2 / 3]))
+    second = _modified_weiszfeld_step(X, first)
+    assert_array_almost_equal(second, np.array([0.2792408, 0.7207592]))
+
+    # A step taken from the fixed point must return the same point.
+    fixed_point = np.array([0.21132505, 0.78867497])
+    assert_array_almost_equal(_modified_weiszfeld_step(X, fixed_point), fixed_point)
+
+
+def test_spatial_median_1d():
+    """In one dimension ``_spatial_median`` must agree with the plain median."""
+    small = np.array([[1.0], [2.0], [3.0]])
+    assert_array_almost_equal(_spatial_median(small)[1], 2.0)
+
+    # Larger problem: the 1-D result is compared for exact equality.
+    rng = np.random.RandomState(0)
+    large = rng.randint(100, size=(1000, 1))
+    expected = np.median(large.ravel())
+    assert_array_equal(_spatial_median(large)[1], expected)
+
+
+def test_spatial_median_2d():
+    """Compare ``_spatial_median`` with a BFGS minimiser and check its warning."""
+    X = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+
+    def total_distance(point):
+        # Sum of the norms of (row - point) over every row of X.
+        return np.sum(np.array([norm(row - point) for row in X]))
+
+    _, median = _spatial_median(X, max_iter=100, tol=1.0e-6)
+    # The median should solve the Fermat-Weber location problem, so a BFGS
+    # run started at the median is expected to return (almost) the same point.
+    fermat_weber = fmin_bfgs(total_distance, median, disp=False)
+    assert_array_almost_equal(median, fermat_weber)
+
+    # Exceeding the iteration cap (tol=0.0) must emit a ConvergenceWarning.
+    warning_message = "Maximum number of iterations 30 reached in spatial median."
+    with pytest.warns(ConvergenceWarning, match=warning_message):
+        _spatial_median(X, max_iter=30, tol=0.0)
+
+
+def test_theil_sen_1d():
+    """Theil-Sen must recover slope and offset where least squares does not."""
+    X, y, true_slope, true_offset = gen_toy_problem_1d()
+
+    # Least squares is expected to miss the slope by more than 0.9.
+    ols = LinearRegression().fit(X, y)
+    ols_error = np.abs(ols.coef_ - true_slope)
+    assert ols_error > 0.9
+
+    # Theil-Sen is expected to match both parameters to one decimal.
+    robust = TheilSenRegressor(random_state=0).fit(X, y)
+    assert_array_almost_equal(robust.coef_, true_slope, 1)
+    assert_array_almost_equal(robust.intercept_, true_offset, 1)
+
+
+def test_theil_sen_1d_no_intercept():
+    """Same comparison as the 1-D test, with ``fit_intercept=False``."""
+    X, y, true_slope, true_offset = gen_toy_problem_1d(intercept=False)
+    # Both estimators are compared against slope + offset.
+    target_coef = true_slope + true_offset
+
+    # Least squares is expected to be off by more than 0.5.
+    ols = LinearRegression(fit_intercept=False).fit(X, y)
+    ols_error = np.abs(ols.coef_ - true_slope - true_offset)
+    assert ols_error > 0.5
+
+    # Theil-Sen is expected to match to one decimal, with a zero intercept.
+    robust = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y)
+    assert_array_almost_equal(robust.coef_, target_coef, 1)
+    assert_almost_equal(robust.intercept_, 0.0)
+
+    # non-regression test for #18104: score() must run without raising
+    robust.score(X, y)
+
+
+def test_theil_sen_2d():
+    """Theil-Sen must recover the 2-D coefficients where least squares fails."""
+    X, y, true_coef, true_offset = gen_toy_problem_2d()
+
+    # Least squares coefficients are expected to be off by a norm above 1.
+    ols = LinearRegression().fit(X, y)
+    ols_error = norm(ols.coef_ - true_coef)
+    assert ols_error > 1.0
+
+    # Theil-Sen, with max_subpopulation=1e3, must match to one decimal.
+    robust = TheilSenRegressor(max_subpopulation=1e3, random_state=0).fit(X, y)
+    assert_array_almost_equal(robust.coef_, true_coef, 1)
+    assert_array_almost_equal(robust.intercept_, true_offset, 1)
+
+
+def test_calc_breakdown_point():
+    """``_breakdown_point(1e10, 2)`` must be within 1e-6 of ``1 - 1/sqrt(2)``."""
+    breakdown = _breakdown_point(1e10, 2)
+    deviation = breakdown - 1 + 1 / (np.sqrt(2))
+    assert np.abs(deviation) < 1.0e-6
+
+
+@pytest.mark.parametrize(
+    "param, ExceptionCls, match",
+    [
+        (
+            {"n_subsamples": 1},
+            ValueError,
+            re.escape("Invalid parameter since n_features+1 > n_subsamples (2 > 1)"),
+        ),
+        (
+            {"n_subsamples": 101},
+            ValueError,
+            re.escape("Invalid parameter since n_subsamples > n_samples (101 > 50)"),
+        ),
+    ],
+)
+def test_checksubparams_invalid_input(param, ExceptionCls, match):
+    """Fitting with an out-of-range ``n_subsamples`` must raise as parametrized.
+
+    ``param`` holds the constructor keyword under test, ``ExceptionCls`` the
+    expected exception type and ``match`` the escaped message pattern.
+    """
+    features, target, _, _ = gen_toy_problem_1d()
+    estimator = TheilSenRegressor(**param, random_state=0)
+    with pytest.raises(ExceptionCls, match=match):
+        estimator.fit(features, target)
+
+
+def test_checksubparams_n_subsamples_if_less_samples_than_features():
+    """``n_subsamples=9`` on a 10-sample, 20-feature problem must raise."""
+    rng = np.random.RandomState(0)
+    shape = (10, 20)  # (n_samples, n_features)
+    features = rng.normal(size=shape)
+    target = rng.normal(size=shape[0])
+    estimator = TheilSenRegressor(n_subsamples=9, random_state=0)
+    with pytest.raises(ValueError):
+        estimator.fit(features, target)
+
+
+def test_subpopulation():
+    """With ``max_subpopulation=250`` the 4-D fit must still match to 1 decimal."""
+    X, y, true_coef, true_offset = gen_toy_problem_4d()
+    estimator = TheilSenRegressor(max_subpopulation=250, random_state=0)
+    estimator.fit(X, y)
+    assert_array_almost_equal(estimator.coef_, true_coef, 1)
+    assert_array_almost_equal(estimator.intercept_, true_offset, 1)
+
+
+def test_subsamples():
+    """Using every sample as one subsample must reproduce least squares."""
+    X, y, _, _ = gen_toy_problem_4d()
+    n_samples = X.shape[0]
+    robust = TheilSenRegressor(n_subsamples=n_samples, random_state=0).fit(X, y)
+    ols = LinearRegression().fit(X, y)
+    # Coefficients must agree with least squares to 9 decimals.
+    assert_array_almost_equal(robust.coef_, ols.coef_, 9)
+
+
+def test_verbosity():
+    """Fit with ``verbose=True`` twice while stdout/stderr are silenced."""
+    features, target, _, _ = gen_toy_problem_1d()
+    # Second run adds max_subpopulation=10; both only need to finish cleanly.
+    with no_stdout_stderr():
+        for extra_params in ({}, {"max_subpopulation": 10}):
+            estimator = TheilSenRegressor(verbose=True, random_state=0, **extra_params)
+            estimator.fit(features, target)
+
+
+def test_theil_sen_parallel():
+    """Repeat the 2-D recovery check with ``n_jobs=2``."""
+    X, y, true_coef, true_offset = gen_toy_problem_2d()
+
+    # Least squares coefficients are expected to be off by a norm above 1.
+    ols = LinearRegression().fit(X, y)
+    ols_error = norm(ols.coef_ - true_coef)
+    assert ols_error > 1.0
+
+    # Theil-Sen with two jobs must match to one decimal.
+    robust = TheilSenRegressor(n_jobs=2, random_state=0, max_subpopulation=2e3)
+    robust.fit(X, y)
+    assert_array_almost_equal(robust.coef_, true_coef, 1)
+    assert_array_almost_equal(robust.intercept_, true_offset, 1)
+
+
+def test_less_samples_than_features():
+    """Check Theil-Sen on a 10-sample, 20-feature problem, both intercept modes."""
+    rng = np.random.RandomState(0)
+    shape = (10, 20)  # (n_samples, n_features)
+    X = rng.normal(size=shape)
+    y = rng.normal(size=shape[0])
+
+    # fit_intercept=False: Theil-Sen falls back to Least Squares, so the
+    # coefficients must agree to 12 decimals.
+    robust = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y)
+    ols = LinearRegression(fit_intercept=False).fit(X, y)
+    assert_array_almost_equal(robust.coef_, ols.coef_, 12)
+
+    # fit_intercept=True: not equal to the Least Squares solution because the
+    # intercept is calculated differently, so compare predictions with the
+    # training targets instead.
+    robust = TheilSenRegressor(fit_intercept=True, random_state=0).fit(X, y)
+    assert_array_almost_equal(robust.predict(X), y, 12)