feat: initial commit - Phase 1 & 2 core features

This commit is contained in:
hiderfong
2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,546 @@
"""
Testing for export functions of decision trees (sklearn.tree.export).
"""
from io import StringIO
from re import finditer, search
from textwrap import dedent
import numpy as np
import pytest
from numpy.random import RandomState
from sklearn.base import is_classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.tree import (
DecisionTreeClassifier,
DecisionTreeRegressor,
export_graphviz,
export_text,
plot_tree,
)
# Toy dataset shared by the export tests: six 2-feature samples split into
# two well-separated groups.
X = [
    [-2, -1],
    [-1, -1],
    [-1, -2],
    [1, 1],
    [1, 2],
    [2, 1],
]
# Single-output targets: first three samples are -1, last three are 1.
y = [-1] * 3 + [1] * 3
# Two-output targets used for the multi-output export checks.
y2 = [[-1, 1], [-1, 1], [-1, 1], [1, 2], [1, 2], [1, 3]]
# Per-sample weights: the second group counts half as much as the first.
w = [1] * 3 + [0.5] * 3
# "Degraded" targets containing a single class only.
y_degraded = [1] * 6
def test_graphviz_toy():
    """Compare ``export_graphviz`` output with reference DOT sources.

    The toy dataset is exported with the default options, with plotting
    options, with depth truncation, for a weighted multi-output classifier,
    for a regressor and for a classifier fitted on a single class.
    """
    tree = DecisionTreeClassifier(
        max_depth=3, min_samples_split=2, criterion="gini", random_state=2
    )
    tree.fit(X, y)

    # Default export.
    dot = export_graphviz(tree, out_file=None)
    expected = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )
    assert dot == expected

    # Plotting options: fill, proportions, HTML-like labels, rounded boxes
    # and a custom font.
    dot = export_graphviz(
        tree,
        filled=True,
        impurity=False,
        proportion=True,
        special_characters=True,
        rounded=True,
        out_file=None,
        fontname="sans",
    )
    expected = (
        "digraph Tree {\n"
        'node [shape=box, style="filled, rounded", color="black", '
        'fontname="sans"] ;\n'
        'edge [fontname="sans"] ;\n'
        "0 [label=<x<SUB>0</SUB> &le; 0.0<br/>samples = 100.0%<br/>"
        'value = [0.5, 0.5]>, fillcolor="#ffffff"] ;\n'
        "1 [label=<samples = 50.0%<br/>value = [1.0, 0.0]>, "
        'fillcolor="#e58139"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        "2 [label=<samples = 50.0%<br/>value = [0.0, 1.0]>, "
        'fillcolor="#399de5"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )
    assert dot == expected

    # Depth truncation: children of the root collapse to "(...)".
    dot = export_graphviz(tree, max_depth=0, class_names=True, out_file=None)
    expected = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]\\nclass = y[0]"] ;\n'
        '1 [label="(...)"] ;\n'
        "0 -> 1 ;\n"
        '2 [label="(...)"] ;\n'
        "0 -> 2 ;\n"
        "}"
    )
    assert dot == expected

    # Depth truncation combined with plotting options.
    dot = export_graphviz(
        tree, max_depth=0, filled=True, out_file=None, node_ids=True
    )
    expected = (
        "digraph Tree {\n"
        'node [shape=box, style="filled", color="black", '
        'fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="node #0\\nx[0] <= 0.0\\ngini = 0.5\\n'
        'samples = 6\\nvalue = [3, 3]", fillcolor="#ffffff"] ;\n'
        '1 [label="(...)", fillcolor="#C0C0C0"] ;\n'
        "0 -> 1 ;\n"
        '2 [label="(...)", fillcolor="#C0C0C0"] ;\n'
        "0 -> 2 ;\n"
        "}"
    )
    assert dot == expected

    # Multi-output classification with weighted samples.
    tree = DecisionTreeClassifier(
        max_depth=2, min_samples_split=2, criterion="gini", random_state=2
    )
    tree = tree.fit(X, y2, sample_weight=w)
    dot = export_graphviz(tree, filled=True, impurity=False, out_file=None)
    expected = (
        "digraph Tree {\n"
        'node [shape=box, style="filled", color="black", '
        'fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\nsamples = 6\\n'
        "value = [[3.0, 1.5, 0.0]\\n"
        '[3.0, 1.0, 0.5]]", fillcolor="#ffffff"] ;\n'
        '1 [label="samples = 3\\nvalue = [[3, 0, 0]\\n'
        '[3, 0, 0]]", fillcolor="#e58139"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="x[0] <= 1.5\\nsamples = 3\\n'
        "value = [[0.0, 1.5, 0.0]\\n"
        '[0.0, 1.0, 0.5]]", fillcolor="#f1bd97"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        '3 [label="samples = 2\\nvalue = [[0, 1, 0]\\n'
        '[0, 1, 0]]", fillcolor="#e58139"] ;\n'
        "2 -> 3 ;\n"
        '4 [label="samples = 1\\nvalue = [[0.0, 0.5, 0.0]\\n'
        '[0.0, 0.0, 0.5]]", fillcolor="#e58139"] ;\n'
        "2 -> 4 ;\n"
        "}"
    )
    assert dot == expected

    # Regression output with plotting options (rotated, leaves on one rank).
    tree = DecisionTreeRegressor(
        max_depth=3, min_samples_split=2, criterion="squared_error", random_state=2
    )
    tree.fit(X, y)
    dot = export_graphviz(
        tree,
        filled=True,
        leaves_parallel=True,
        out_file=None,
        rotate=True,
        rounded=True,
        fontname="sans",
    )
    expected = (
        "digraph Tree {\n"
        'node [shape=box, style="filled, rounded", color="black", '
        'fontname="sans"] ;\n'
        "graph [ranksep=equally, splines=polyline] ;\n"
        'edge [fontname="sans"] ;\n'
        "rankdir=LR ;\n"
        '0 [label="x[0] <= 0.0\\nsquared_error = 1.0\\nsamples = 6\\n'
        'value = 0.0", fillcolor="#f2c09c"] ;\n'
        '1 [label="squared_error = 0.0\\nsamples = 3\\nvalue = -1.0", '
        'fillcolor="#ffffff"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=-45, "
        'headlabel="True"] ;\n'
        '2 [label="squared_error = 0.0\\nsamples = 3\\nvalue = 1.0", '
        'fillcolor="#e58139"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=45, "
        'headlabel="False"] ;\n'
        "{rank=same ; 0} ;\n"
        "{rank=same ; 1; 2} ;\n"
        "}"
    )
    assert dot == expected

    # Classifier fitted on a degraded (single-class) learning set.
    tree = DecisionTreeClassifier(max_depth=3)
    tree.fit(X, y_degraded)
    dot = export_graphviz(tree, filled=True, out_file=None)
    # NOTE(review): this reference string is built but never compared with
    # ``dot``, so this section only checks that the export does not raise.
    # Confirm the expected text is still current before adding an assertion.
    expected = (
        "digraph Tree {\n"
        'node [shape=box, style="filled", color="black", '
        'fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="gini = 0.0\\nsamples = 6\\nvalue = 6.0", '
        'fillcolor="#ffffff"] ;\n'
        "}"
    )
@pytest.mark.parametrize("constructor", [list, np.array])
def test_graphviz_feature_class_names_array_support(constructor):
    """Check that ``export_graphviz`` accepts names as lists or arrays.

    ``constructor`` wraps the name sequences so that both plain lists and
    NumPy arrays are exercised for ``feature_names`` and ``class_names``.
    """
    tree = DecisionTreeClassifier(
        max_depth=3, min_samples_split=2, criterion="gini", random_state=2
    )
    tree.fit(X, y)

    # Custom feature names replace the default "x[i]" labels.
    feature_names = constructor(["feature0", "feature1"])
    dot = export_graphviz(tree, feature_names=feature_names, out_file=None)
    expected = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="feature0 <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )
    assert dot == expected

    # Custom class names add a "class = ..." line to every node.
    class_names = constructor(["yes", "no"])
    dot = export_graphviz(tree, class_names=class_names, out_file=None)
    expected = (
        "digraph Tree {\n"
        'node [shape=box, fontname="helvetica"] ;\n'
        'edge [fontname="helvetica"] ;\n'
        '0 [label="x[0] <= 0.0\\ngini = 0.5\\nsamples = 6\\n'
        'value = [3, 3]\\nclass = yes"] ;\n'
        '1 [label="gini = 0.0\\nsamples = 3\\nvalue = [3, 0]\\n'
        'class = yes"] ;\n'
        "0 -> 1 [labeldistance=2.5, labelangle=45, "
        'headlabel="True"] ;\n'
        '2 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]\\n'
        'class = no"] ;\n'
        "0 -> 2 [labeldistance=2.5, labelangle=-45, "
        'headlabel="False"] ;\n'
        "}"
    )
    assert dot == expected
def test_graphviz_errors():
    """Check the exceptions raised by ``export_graphviz`` on bad input."""
    tree = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # Exporting an estimator that has not been fitted must fail.
    buffer = StringIO()
    with pytest.raises(NotFittedError):
        export_graphviz(tree, buffer)

    tree.fit(X, y)

    # The number of feature names must match the number of features,
    # whether too few or too many names are given.
    too_few = "Length of feature_names, 1 does not match number of features, 2"
    with pytest.raises(ValueError, match=too_few):
        export_graphviz(tree, None, feature_names=["a"])

    too_many = "Length of feature_names, 3 does not match number of features, 2"
    with pytest.raises(ValueError, match=too_many):
        export_graphviz(tree, None, feature_names=["a", "b", "c"])

    # Passing the low-level tree object instead of the estimator is rejected.
    not_estimator = "is not an estimator instance"
    with pytest.raises(TypeError, match=not_estimator):
        export_graphviz(tree.fit(X, y).tree_)

    # An empty list of class names cannot be indexed.
    buffer = StringIO()
    with pytest.raises(IndexError):
        export_graphviz(tree, buffer, class_names=[])
def test_friedman_mse_in_graphviz():
    """Check that node labels mention ``friedman_mse`` when it is the criterion.

    A friedman_mse regressor and the trees of a small gradient boosting
    classifier are written to the same buffer; every bracketed chunk of the
    output that mentions ``samples`` must also mention ``friedman_mse``.
    """
    buffer = StringIO()

    regressor = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    regressor.fit(X, y)
    export_graphviz(regressor, out_file=buffer)

    booster = GradientBoostingClassifier(n_estimators=2, random_state=0)
    booster.fit(X, y)
    for stage in booster.estimators_:
        export_graphviz(stage[0], out_file=buffer)

    exported = buffer.getvalue()
    for match in finditer(r"\[.*?samples.*?\]", exported):
        assert "friedman_mse" in match.group()
def test_precision():
    """Check that ``export_graphviz`` honours the ``precision`` argument.

    With the random states used here, impurities and thresholds are printed
    with exactly ``precision`` decimals, so they are compared strictly. The
    reported values may carry fewer decimals, so only an upper bound is
    checked for them.
    """

    def printed_decimals(match):
        # Length of the ".ddd" part of the matched number, dot included.
        return len(search(r"\.\d+", match.group()).group())

    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    # Each generator is consumed in the same order as before: features
    # first, then targets.
    X_reg = rng_reg.random_sample((5, 2))
    X_clf = rng_clf.random_sample((1000, 4))
    y_reg = rng_reg.random_sample((5,))
    y_clf = rng_clf.randint(2, size=(1000,))
    regressor = DecisionTreeRegressor(
        criterion="friedman_mse", random_state=0, max_depth=1
    )
    classifier = DecisionTreeClassifier(max_depth=1, random_state=0)

    cases = [(X_reg, y_reg, regressor), (X_clf, y_clf, classifier)]
    for data, target, estimator in cases:
        estimator.fit(data, target)

        if is_classifier(estimator):
            impurity_pattern = r"gini = \d+\.\d+"
        else:
            impurity_pattern = r"friedman_mse = \d+\.\d+"

        for precision in (4, 3):
            dot_data = export_graphviz(
                estimator, out_file=None, precision=precision, proportion=True
            )
            expected_length = precision + 1

            # Values: at most ``precision`` decimals.
            for match in finditer(r"value = \d+\.\d+", dot_data):
                assert printed_decimals(match) <= expected_length

            # Impurities: exactly ``precision`` decimals.
            for match in finditer(impurity_pattern, dot_data):
                assert printed_decimals(match) == expected_length

            # Thresholds: exactly ``precision`` decimals.
            for match in finditer(r"<= \d+\.\d+", dot_data):
                assert printed_decimals(match) == expected_length
def test_export_text_errors():
    """Check that ``export_text`` rejects name lists of the wrong length."""
    tree = DecisionTreeClassifier(max_depth=2, random_state=0)
    tree.fit(X, y)

    # One feature name for a tree fitted on two features.
    feature_names_msg = "feature_names must contain 2 elements, got 1"
    with pytest.raises(ValueError, match=feature_names_msg):
        export_text(tree, feature_names=["a"])

    # One class name for a tree fitted on two classes.
    class_names_msg = (
        "When `class_names` is an array, it should contain as"
        " many items as `decision_tree.classes_`. Got 1 while"
        " the tree was fitted with 2 classes."
    )
    with pytest.raises(ValueError, match=class_names_msg):
        export_text(tree, class_names=["a"])
def test_export_text():
    """Compare ``export_text`` reports with reference text for several options.

    Covers the default report, ``max_depth`` truncation, ``show_weights``,
    ``spacing``, and multi-output regression with ``decimals`` and custom
    feature names.

    The reference reports follow the layout emitted by ``export_text``:
    nested levels are indented by ``spacing`` spaces after the pipe, and the
    right-hand branch is printed as ``">  "`` (two spaces) so that it lines
    up with ``"<= "``.
    """
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    expected_report = dedent(
        """
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- class: 1
    """
    ).lstrip()

    assert export_text(clf) == expected_report
    # Leaves directly below the root are not truncated by max_depth=0.
    assert export_text(clf, max_depth=0) == expected_report
    # A max_depth larger than the tree leaves the report unchanged.
    assert export_text(clf, max_depth=10) == expected_report

    expected_report = dedent(
        """
    |--- feature_1 <= 0.00
    |   |--- weights: [3.00, 0.00] class: -1
    |--- feature_1 >  0.00
    |   |--- weights: [0.00, 3.00] class: 1
    """
    ).lstrip()
    assert export_text(clf, show_weights=True) == expected_report

    expected_report = dedent(
        """
    |- feature_1 <= 0.00
    | |- class: -1
    |- feature_1 >  0.00
    | |- class: 1
    """
    ).lstrip()
    assert export_text(clf, spacing=1) == expected_report

    # A third class makes the right branch deeper, so it gets truncated.
    X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]]
    y_l = [-1, -1, -1, 1, 1, 1, 2]
    clf = DecisionTreeClassifier(max_depth=4, random_state=0)
    clf.fit(X_l, y_l)
    expected_report = dedent(
        """
    |--- feature_1 <= 0.00
    |   |--- class: -1
    |--- feature_1 >  0.00
    |   |--- truncated branch of depth 2
    """
    ).lstrip()
    assert export_text(clf, max_depth=0) == expected_report

    # Multi-output regression: show_weights does not change the report.
    X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y_mo = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]

    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_mo, y_mo)

    expected_report = dedent(
        """
    |--- feature_1 <= 0.0
    |   |--- value: [-1.0, -1.0]
    |--- feature_1 >  0.0
    |   |--- value: [1.0, 1.0]
    """
    ).lstrip()
    assert export_text(reg, decimals=1) == expected_report
    assert export_text(reg, decimals=1, show_weights=True) == expected_report

    # Single feature with a custom feature name.
    X_single = [[-2], [-1], [-1], [1], [1], [2]]
    reg = DecisionTreeRegressor(max_depth=2, random_state=0)
    reg.fit(X_single, y_mo)

    expected_report = dedent(
        """
    |--- first <= 0.0
    |   |--- value: [-1.0, -1.0]
    |--- first >  0.0
    |   |--- value: [1.0, 1.0]
    """
    ).lstrip()
    assert export_text(reg, decimals=1, feature_names=["first"]) == expected_report
    assert (
        export_text(reg, decimals=1, show_weights=True, feature_names=["first"])
        == expected_report
    )
@pytest.mark.parametrize("constructor", [list, np.array])
def test_export_text_feature_class_names_array_support(constructor):
    """Check that ``export_text`` accepts names as lists or arrays.

    ``constructor`` wraps the name sequences so that both plain lists and
    NumPy arrays are exercised for ``feature_names`` and ``class_names``.
    The reference reports use the ``export_text`` layout: three spaces after
    the pipe for nested levels and ``">  "`` for the right-hand branch.
    """
    clf = DecisionTreeClassifier(max_depth=2, random_state=0)
    clf.fit(X, y)

    # Custom feature names: the split is on the second feature, "b".
    expected_report = dedent(
        """
    |--- b <= 0.00
    |   |--- class: -1
    |--- b >  0.00
    |   |--- class: 1
    """
    ).lstrip()
    assert export_text(clf, feature_names=constructor(["a", "b"])) == expected_report

    # Custom class names replace the fitted class labels.
    expected_report = dedent(
        """
    |--- feature_1 <= 0.00
    |   |--- class: cat
    |--- feature_1 >  0.00
    |   |--- class: dog
    """
    ).lstrip()
    assert export_text(clf, class_names=constructor(["cat", "dog"])) == expected_report
def test_plot_tree_entropy(pyplot):
    """Smoke-test ``plot_tree`` for a classifier using the entropy criterion.

    Five annotations are expected: the root, the two leaves, and the padded
    "True" / "False" edge labels drawn next to the root's outgoing arrows.
    """
    clf = DecisionTreeClassifier(
        max_depth=3, min_samples_split=2, criterion="entropy", random_state=2
    )
    clf.fit(X, y)

    # Plot with custom feature names and inspect the annotation texts.
    feature_names = ["first feat", "sepal_width"]
    nodes = plot_tree(clf, feature_names=feature_names)
    assert len(nodes) == 5
    assert (
        nodes[0].get_text()
        == "first feat <= 0.0\nentropy = 1.0\nsamples = 6\nvalue = [3, 3]"
    )
    assert nodes[1].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [3, 0]"
    # Edge labels are padded with two spaces on the side facing the arrow.
    assert nodes[2].get_text() == "True  "
    assert nodes[3].get_text() == "entropy = 0.0\nsamples = 3\nvalue = [0, 3]"
    assert nodes[4].get_text() == "  False"
@pytest.mark.parametrize("fontsize", [None, 10, 20])
def test_plot_tree_gini(pyplot, fontsize):
    """Smoke-test ``plot_tree`` for a gini classifier and several font sizes.

    Five annotations are expected: the root, the two leaves, and the padded
    "True" / "False" edge labels. When ``fontsize`` is given, every
    annotation must use it.
    """
    clf = DecisionTreeClassifier(
        max_depth=3,
        min_samples_split=2,
        criterion="gini",
        random_state=2,
    )
    clf.fit(X, y)

    # Plot with custom feature names and inspect the annotation texts.
    feature_names = ["first feat", "sepal_width"]
    nodes = plot_tree(clf, feature_names=feature_names, fontsize=fontsize)
    assert len(nodes) == 5
    if fontsize is not None:
        assert all(node.get_fontsize() == fontsize for node in nodes)
    assert (
        nodes[0].get_text()
        == "first feat <= 0.0\ngini = 0.5\nsamples = 6\nvalue = [3, 3]"
    )
    assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
    # Edge labels are padded with two spaces on the side facing the arrow.
    assert nodes[2].get_text() == "True  "
    assert nodes[3].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
    assert nodes[4].get_text() == "  False"
def test_not_fitted_tree(pyplot):
    """Check that ``plot_tree`` raises ``NotFittedError`` on an unfitted tree."""
    unfitted = DecisionTreeRegressor()
    with pytest.raises(NotFittedError):
        plot_tree(unfitted)
@@ -0,0 +1,508 @@
import numpy as np
import pytest
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (
ExtraTreesClassifier,
ExtraTreesRegressor,
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.tree import (
DecisionTreeClassifier,
DecisionTreeRegressor,
ExtraTreeClassifier,
ExtraTreeRegressor,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS
# Single-tree estimators.
TREE_CLASSIFIER_CLASSES = [DecisionTreeClassifier, ExtraTreeClassifier]
TREE_REGRESSOR_CLASSES = [DecisionTreeRegressor, ExtraTreeRegressor]
# Single-tree estimators followed by the forest ensembles built on them.
TREE_BASED_CLASSIFIER_CLASSES = [
    *TREE_CLASSIFIER_CLASSES,
    RandomForestClassifier,
    ExtraTreesClassifier,
]
TREE_BASED_REGRESSOR_CLASSES = [
    *TREE_REGRESSOR_CLASSES,
    RandomForestRegressor,
    ExtraTreesRegressor,
]
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("sparse_splitter", (True, False))
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_monotonic_constraints_classifications(
    TreeClassifier,
    depth_first_builder,
    sparse_splitter,
    global_random_seed,
    csc_container,
):
    """Check monotonic constraints on binary classifiers' probabilities.

    Feature 0 gets an increasing constraint and feature 1 a decreasing one.
    Shifting either feature by +/-10 on held-out data must move the
    predicted probability of the positive class in the constrained direction.
    """
    n_samples = 1000
    n_samples_train = 900
    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=global_random_seed,
    )
    X_train = X[:n_samples_train]
    y_train = y[:n_samples_train]
    X_test = X[n_samples_train:]

    def shifted_copy(column, delta):
        # Copy of the test set with one feature column shifted by ``delta``.
        moved = np.copy(X_test)
        moved[:, column] += delta
        return moved

    X_test_0incr = shifted_copy(0, 10)
    X_test_0decr = shifted_copy(0, -10)
    X_test_1incr = shifted_copy(1, 10)
    X_test_1decr = shifted_copy(1, -10)

    monotonic_cst = np.zeros(X.shape[1])
    monotonic_cst[0] = 1
    monotonic_cst[1] = -1

    params = {"max_depth": None, "monotonic_cst": monotonic_cst}
    if not depth_first_builder:
        params["max_leaf_nodes"] = n_samples_train
    est = TreeClassifier(**params)
    if hasattr(est, "random_state"):
        est.set_params(random_state=global_random_seed)
    if hasattr(est, "n_estimators"):
        est.set_params(n_estimators=5)
    if sparse_splitter:
        X_train = csc_container(X_train)
    est.fit(X_train, y_train)

    proba_test = est.predict_proba(X_test)
    in_unit_interval = np.logical_and(proba_test >= 0.0, proba_test <= 1.0)
    assert in_unit_interval.all(), "Probability should always be in [0, 1] range."
    assert_allclose(proba_test.sum(axis=1), 1.0)

    positive_proba = proba_test[:, 1]
    # Monotonic increase constraint, it applies to the positive class
    assert np.all(est.predict_proba(X_test_0incr)[:, 1] >= positive_proba)
    assert np.all(est.predict_proba(X_test_0decr)[:, 1] <= positive_proba)
    # Monotonic decrease constraint, it applies to the positive class
    assert np.all(est.predict_proba(X_test_1incr)[:, 1] <= positive_proba)
    assert np.all(est.predict_proba(X_test_1decr)[:, 1] >= positive_proba)
@pytest.mark.parametrize("TreeRegressor", TREE_BASED_REGRESSOR_CLASSES)
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("sparse_splitter", (True, False))
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_monotonic_constraints_regressions(
    TreeRegressor,
    depth_first_builder,
    sparse_splitter,
    criterion,
    global_random_seed,
    csc_container,
):
    """Check monotonic constraints on regressors' predictions.

    Feature 0 gets an increasing constraint and feature 1 a decreasing one.
    Raising feature 0 on held-out data must not lower the prediction, and
    raising feature 1 must not raise it.
    """
    n_samples = 1000
    n_samples_train = 900
    # Build a regression task using 5 informative features
    X, y = make_regression(
        n_samples=n_samples,
        n_features=5,
        n_informative=5,
        random_state=global_random_seed,
    )
    train_idx = np.arange(n_samples_train)
    test_idx = np.arange(n_samples_train, n_samples)
    X_train, y_train = X[train_idx], y[train_idx]
    X_test = np.copy(X[test_idx])

    # Raising feature 0 should increase predictions; raising feature 1
    # should decrease them.
    X_test_incr = np.copy(X_test)
    X_test_incr[:, 0] += 10
    X_test_decr = np.copy(X_test)
    X_test_decr[:, 1] += 10

    monotonic_cst = np.zeros(X.shape[1])
    monotonic_cst[0] = 1
    monotonic_cst[1] = -1

    if depth_first_builder:
        params = {"max_depth": None}
    else:
        params = {"max_depth": 8, "max_leaf_nodes": n_samples_train}
    est = TreeRegressor(monotonic_cst=monotonic_cst, criterion=criterion, **params)
    if hasattr(est, "random_state"):
        est.set_params(random_state=global_random_seed)
    if hasattr(est, "n_estimators"):
        est.set_params(n_estimators=5)
    if sparse_splitter:
        X_train = csc_container(X_train)
    est.fit(X_train, y_train)

    y_pred = est.predict(X_test)
    # Monotonic increase constraint: never below the reference prediction.
    y_pred_incr = est.predict(X_test_incr)
    assert np.all(y_pred_incr >= y_pred)
    # Monotonic decrease constraint: never above the reference prediction.
    y_pred_decr = est.predict(X_test_decr)
    assert np.all(y_pred_decr <= y_pred)
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
def test_multiclass_raises(TreeClassifier):
    """Check that fitting with constraints on a 3-class problem raises."""
    X, y = make_classification(
        n_samples=100, n_features=5, n_classes=3, n_informative=3, random_state=0
    )
    y[0] = 0

    constraints = np.zeros(X.shape[1])
    constraints[0] = -1
    constraints[1] = 1
    est = TreeClassifier(max_depth=None, monotonic_cst=constraints, random_state=0)

    expected = "Monotonicity constraints are not supported with multiclass classification"
    with pytest.raises(ValueError, match=expected):
        est.fit(X, y)
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
def test_multiple_output_raises(TreeClassifier):
    """Check that fitting with constraints on multi-output targets raises."""
    X = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
    y = [[1, 0, 1, 0, 1], [1, 0, 1, 0, 1]]

    constraints = np.array([-1, 1])
    est = TreeClassifier(max_depth=None, monotonic_cst=constraints, random_state=0)

    expected = "Monotonicity constraints are not supported with multiple output"
    with pytest.raises(ValueError, match=expected):
        est.fit(X, y)
@pytest.mark.parametrize(
    "DecisionTreeEstimator", [DecisionTreeClassifier, DecisionTreeRegressor]
)
def test_missing_values_raises(DecisionTreeEstimator):
    """Check that constrained trees reject input containing NaN."""
    X, y = make_classification(
        n_samples=100, n_features=5, n_classes=2, n_informative=3, random_state=0
    )
    # Inject a single missing value into the constrained feature.
    X[0, 0] = np.nan

    constraints = np.zeros(X.shape[1])
    constraints[0] = 1
    est = DecisionTreeEstimator(
        max_depth=None, monotonic_cst=constraints, random_state=0
    )

    expected = "Input X contains NaN"
    with pytest.raises(ValueError, match=expected):
        est.fit(X, y)
@pytest.mark.parametrize("TreeClassifier", TREE_BASED_CLASSIFIER_CLASSES)
def test_bad_monotonic_cst_raises(TreeClassifier):
    """Check the errors raised for malformed ``monotonic_cst`` arrays."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    y = [1, 0, 1, 0, 1]

    # Three constraints for two features.
    shape_msg = "monotonic_cst has shape 3 but the input data X has 2 features."
    est = TreeClassifier(
        max_depth=None, monotonic_cst=np.array([-1, 1, 0]), random_state=0
    )
    with pytest.raises(ValueError, match=shape_msg):
        est.fit(X, y)

    # Values outside of {-1, 0, 1}.
    value_msg = "monotonic_cst must be None or an array-like of -1, 0 or 1."
    est = TreeClassifier(
        max_depth=None, monotonic_cst=np.array([-2, 2]), random_state=0
    )
    with pytest.raises(ValueError, match=value_msg):
        est.fit(X, y)

    # A non-integer value; the message is also expected to end with it.
    est = TreeClassifier(
        max_depth=None, monotonic_cst=np.array([-1, 0.8]), random_state=0
    )
    with pytest.raises(ValueError, match=value_msg + "(.*)0.8]"):
        est.fit(X, y)
def assert_1d_reg_tree_children_monotonic_bounded(tree_, monotonic_sign):
    """Assert that a single-feature regression tree respects a constraint.

    For every split node, the two children must be ordered according to
    ``monotonic_sign``. In addition, the inner grand-children (the right
    child of the left child and the left child of the right child) must lie
    on the correct side of the midpoint of the two children values.

    Parameters
    ----------
    tree_ : object
        Fitted tree exposing ``value``, ``node_count``, ``children_left``,
        ``children_right`` and ``feature``.
    monotonic_sign : {-1, 0, 1}
        1 for an increasing constraint, -1 for a decreasing one. With 0 no
        check is performed.

    Raises
    ------
    AssertionError
        If a node violates the constraint.
    ValueError
        If ``monotonic_sign`` is not -1, 0 or 1. Without this check an
        unsupported sign would make the helper pass without checking anything.
    """
    if monotonic_sign not in (-1, 0, 1):
        raise ValueError(f"monotonic_sign={monotonic_sign}")

    values = tree_.value
    for i in range(tree_.node_count):
        i_left = tree_.children_left[i]
        i_right = tree_.children_right[i]
        # Only nodes whose two children have larger indices are inspected.
        if not (i_left > i and i_right > i):
            continue

        # Check monotonicity on children
        if monotonic_sign == 1:
            assert values[i_left] <= values[i_right]
        elif monotonic_sign == -1:
            assert values[i_left] >= values[i_right]
        val_middle = (values[i_left] + values[i_right]) / 2

        # Check bounds on grand-children, filtering out leaf nodes
        if tree_.feature[i_left] >= 0:
            i_left_right = tree_.children_right[i_left]
            if monotonic_sign == 1:
                assert values[i_left_right] <= val_middle
            elif monotonic_sign == -1:
                assert values[i_left_right] >= val_middle

        if tree_.feature[i_right] >= 0:
            i_right_left = tree_.children_left[i_right]
            if monotonic_sign == 1:
                assert val_middle <= values[i_right_left]
            elif monotonic_sign == -1:
                assert val_middle >= values[i_right_left]
def test_assert_1d_reg_tree_children_monotonic_bounded():
    """Check that the 1d helper fails on a tree fitted to a sine wave.

    The unconstrained tree is expected to violate both an increasing and a
    decreasing constraint.
    """
    X = np.linspace(-1, 1, 7).reshape(-1, 1)
    y = np.sin(2 * np.pi * X.ravel())

    unconstrained = DecisionTreeRegressor(max_depth=None, random_state=0)
    unconstrained.fit(X, y)

    for sign in (1, -1):
        with pytest.raises(AssertionError):
            assert_1d_reg_tree_children_monotonic_bounded(unconstrained.tree_, sign)
def assert_1d_reg_monotonic(clf, monotonic_sign, min_x, max_x, n_steps):
    """Assert that predictions are monotonic along a 1d grid.

    Parameters
    ----------
    clf : object
        Estimator exposing ``predict``; it is called once on a column vector
        of ``n_steps`` evenly spaced points between ``min_x`` and ``max_x``.
    monotonic_sign : {-1, 0, 1}
        1 requires non-decreasing predictions, -1 non-increasing ones. With 0
        no check is performed.
    min_x, max_x : float
        Bounds of the evaluation grid.
    n_steps : int
        Number of grid points.

    Raises
    ------
    AssertionError
        If two consecutive predictions violate the constraint.
    ValueError
        If ``monotonic_sign`` is not -1, 0 or 1. Without this check an
        unsupported sign would make the helper pass without checking anything.
    """
    if monotonic_sign not in (-1, 0, 1):
        raise ValueError(f"monotonic_sign={monotonic_sign}")

    X_grid = np.linspace(min_x, max_x, n_steps).reshape(-1, 1)
    y_pred_grid = clf.predict(X_grid)
    steps = np.diff(y_pred_grid)
    if monotonic_sign == 1:
        assert (steps >= 0.0).all()
    elif monotonic_sign == -1:
        assert (steps <= 0.0).all()
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
def test_1d_opposite_monotonicity_cst_data(TreeRegressor):
    """Check that a constraint opposing the data yields a single-node tree.

    With increasing data and a decreasing constraint (and the other way
    round), no split is kept and the root value is 0.0.
    """
    X = np.linspace(-2, 2, 10).reshape(-1, 1)
    y = X.ravel()

    # Increasing data with a decreasing constraint, then the swapped case.
    for sign, target in ((-1, y), (1, -y)):
        reg = TreeRegressor(monotonic_cst=[sign])
        reg.fit(X, target)
        assert reg.tree_.node_count == 1
        assert reg.tree_.value[0] == 0.0
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
@pytest.mark.parametrize("monotonic_sign", (-1, 1))
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
def test_1d_tree_nodes_values(
    TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed
):
    """Check node values of a constrained tree fitted on one random feature.

    Adaptation from test_nodes_values in test_monotonic_constraints.py
    in sklearn.ensemble._hist_gradient_boosting.

    Considering the following tree with a monotonic +1 constraint, we
    should have::

              root
             /    \\
            a      b
           / \\    / \\
          c   d  e   f

        a <= root <= b
        c <= d <= (a + b) / 2 <= e <= f
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 1000
    n_features = 1
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    params = {
        "monotonic_cst": [monotonic_sign],
        "criterion": criterion,
        "random_state": global_random_seed,
    }
    if not depth_first_builder:
        # max_leaf_nodes triggers best first tree builder; without it the
        # default depth first tree builder is used.
        params["max_leaf_nodes"] = n_samples
    reg = TreeRegressor(**params)
    reg.fit(X, y)

    assert_1d_reg_tree_children_monotonic_bounded(reg.tree_, monotonic_sign)
    assert_1d_reg_monotonic(reg, monotonic_sign, np.min(X), np.max(X), 100)
def assert_nd_reg_tree_children_monotonic_bounded(tree_, monotonic_cst):
    """Assert that a regression tree respects per-feature constraints.

    Nodes are visited in index order. Each node value must lie within the
    bounds recorded for it, and each split on a constrained feature must
    order its two children accordingly. Bounds for the children are recorded
    while visiting their parent, so the check assumes children have larger
    indices than their parent.

    Parameters
    ----------
    tree_ : object
        Fitted tree exposing ``node_count``, ``feature``, ``value``,
        ``children_left`` and ``children_right``. Values are read as
        ``value[i][0][0]``.
    monotonic_cst : sequence of {-1, 0, 1}
        Constraint for each feature, indexed by feature id.

    Raises
    ------
    AssertionError
        If a node value is out of bounds or two siblings are misordered.
    ValueError
        If a constraint is not -1, 0 or 1.
    """
    upper_bound = np.full(tree_.node_count, np.inf)
    lower_bound = np.full(tree_.node_count, -np.inf)
    for i in range(tree_.node_count):
        feature = tree_.feature[i]
        node_value = tree_.value[i][0][0]  # unpack value from nx1x1 array
        # While building the tree, the computed middle value is slightly
        # different from the average of the siblings values, because
        #     sum_right / weighted_n_right
        # is slightly different from the value of the right sibling.
        # This can cause a discrepancy up to numerical noise when clipping,
        # which is resolved by comparing with some loss of precision.
        assert np.float32(node_value) <= np.float32(upper_bound[i])
        assert np.float32(node_value) >= np.float32(lower_bound[i])

        if feature < 0:
            # Leaf: nothing to do
            continue

        # Split node: check and update bounds for the children.
        i_left = tree_.children_left[i]
        i_right = tree_.children_right[i]
        # Unpack both sibling values once so that every comparison below
        # operates on scalars.
        left_value = tree_.value[i_left][0][0]
        right_value = tree_.value[i_right][0][0]
        middle_value = (left_value + right_value) / 2

        if monotonic_cst[feature] == 0:
            # Feature without monotonicity constraint: propagate bounds
            # down the tree to both children.
            # Otherwise, with 2 features and a monotonic increase constraint
            # (encoded by +1) on feature 0, the following tree can be accepted,
            # although it does not respect the monotonic increase constraint:
            #
            #                      X[0] <= 0
            #                      value = 100
            #                     /            \
            #          X[0] <= -1                X[1] <= 0
            #          value = 50                value = 150
            #        /            \             /            \
            #    leaf           leaf           leaf          leaf
            #    value = 25     value = 75     value = 50    value = 250
            lower_bound[i_left] = lower_bound[i]
            upper_bound[i_left] = upper_bound[i]
            lower_bound[i_right] = lower_bound[i]
            upper_bound[i_right] = upper_bound[i]

        elif monotonic_cst[feature] == 1:
            # Feature with constraint: check monotonicity
            assert left_value <= right_value

            # Propagate bounds down the tree to both children.
            lower_bound[i_left] = lower_bound[i]
            upper_bound[i_left] = middle_value
            lower_bound[i_right] = middle_value
            upper_bound[i_right] = upper_bound[i]

        elif monotonic_cst[feature] == -1:
            # Feature with constraint: check monotonicity
            assert left_value >= right_value

            # Update and propagate bounds down the tree to both children.
            lower_bound[i_left] = middle_value
            upper_bound[i_left] = upper_bound[i]
            lower_bound[i_right] = lower_bound[i]
            upper_bound[i_right] = middle_value

        else:  # pragma: no cover
            raise ValueError(f"monotonic_cst[{feature}]={monotonic_cst[feature]}")
def test_assert_nd_reg_tree_children_monotonic_bounded():
    """Check that the nd helper detects trees violating a constraint."""
    # A tree fitted to a sine wave violates both constraint directions but
    # is accepted when the feature is unconstrained.
    X = np.linspace(0, 2 * np.pi, 30).reshape(-1, 1)
    y = np.sin(X).ravel()
    sine_tree = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y).tree_

    for constraint in ([1], [-1]):
        with pytest.raises(AssertionError):
            assert_nd_reg_tree_children_monotonic_bounded(sine_tree, constraint)

    assert_nd_reg_tree_children_monotonic_bounded(sine_tree, [0])

    # The helper must also raise when the data (and therefore the model) is
    # naturally monotonic in the direction opposite to the constraint.
    X = np.linspace(-5, 5, 5).reshape(-1, 1)
    y = X.ravel() ** 3
    cubic = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, y)

    with pytest.raises(AssertionError):
        assert_nd_reg_tree_children_monotonic_bounded(cubic.tree_, [-1])

    # For completeness, check that the converse holds when swapping the sign.
    cubic = DecisionTreeRegressor(max_depth=None, random_state=0).fit(X, -y)

    with pytest.raises(AssertionError):
        assert_nd_reg_tree_children_monotonic_bounded(cubic.tree_, [1])
@pytest.mark.parametrize("TreeRegressor", TREE_REGRESSOR_CLASSES)
@pytest.mark.parametrize("monotonic_sign", (-1, 1))
@pytest.mark.parametrize("depth_first_builder", (True, False))
@pytest.mark.parametrize("criterion", ("absolute_error", "squared_error"))
def test_nd_tree_nodes_values(
    TreeRegressor, monotonic_sign, depth_first_builder, criterion, global_random_seed
):
    """Check node values of a constrained tree fitted on two random features.

    Only the first feature is constrained. Considering the following tree
    with a monotonic increase constraint on X[0], we should have::

                 root
                X[0]<=t
               /       \\
              a         b
          X[0]<=u   X[1]<=v
         /       \\   /     \\
        c        d  e       f

        i)   a <= root <= b
        ii)  c <= a <= d <= (a+b)/2
        iii) (a+b)/2 <= min(e,f)

    For iii) we check that each node value is within the proper lower and
    upper bounds.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 1000
    n_features = 2
    monotonic_cst = [monotonic_sign, 0]
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    params = {
        "monotonic_cst": monotonic_cst,
        "criterion": criterion,
        "random_state": global_random_seed,
    }
    if not depth_first_builder:
        # max_leaf_nodes triggers best first tree builder; without it the
        # default depth first tree builder is used.
        params["max_leaf_nodes"] = n_samples
    reg = TreeRegressor(**params)
    reg.fit(X, y)

    assert_nd_reg_tree_children_monotonic_bounded(reg.tree_, monotonic_cst)
@@ -0,0 +1,49 @@
import numpy as np
import pytest
from sklearn.tree._reingold_tilford import Tree, buchheim
# Root (0) with two leaf children (1 and 2): 3 nodes.
simple_tree = Tree(
    "",
    0,
    Tree("", 1),
    Tree("", 2),
)

# Three-level binary tree with one extra level below node 4: 9 nodes.
bigger_tree = Tree(
    "",
    0,
    Tree("", 1, Tree("", 3), Tree("", 4, Tree("", 7), Tree("", 8))),
    Tree("", 2, Tree("", 5), Tree("", 6)),
)
@pytest.mark.parametrize("tree, n_nodes", [(simple_tree, 3), (bigger_tree, 9)])
def test_buchheim(tree, n_nodes):
    """Check basic layout invariants of ``buchheim`` on binary trees.

    Children sit one level below their parent, parents are centred above
    their two children, every node is laid out, and no two nodes of the same
    level share an x coordinate.
    """

    def collect_coordinates(node):
        # Depth-first list of (x, y) pairs, checking invariants on the way.
        collected = [(node.x, node.y)]
        for child in node.children:
            # parents higher than children:
            assert child.y == node.y + 1
            collected.extend(collect_coordinates(child))
        if len(node.children):
            # these trees are always binary
            # parents are centered above children
            first, second = node.children[0], node.children[1]
            assert node.x == (first.x + second.x) / 2
        return collected

    coordinates = collect_coordinates(buchheim(tree))
    assert len(coordinates) == n_nodes

    # x values must be unique within each level; walk the levels from the
    # top until an empty one is found.
    level = 0
    while True:
        xs_on_level = [x for x, y in coordinates if y == level]
        if not xs_on_level:
            # reached all leafs
            break
        assert len(np.unique(xs_on_level)) == len(xs_on_level)
        level += 1
File diff suppressed because it is too large Load Diff