= est.cost_complexity_pruning_path(X, y)

    pruning_path = info.ccp_alphas
    impurities = info.impurities
    assert np.all(np.diff(pruning_path) >= 0)
    assert np.all(np.diff(impurities) >= 0)

    assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)


@pytest.mark.parametrize("dataset", DATASETS.keys())
@pytest.mark.parametrize("tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor])
def test_prune_tree_regression_are_subtrees(dataset, tree_cls):
    dataset = DATASETS[dataset]
    X, y = dataset["X"], dataset["y"]

    est = tree_cls(max_leaf_nodes=20, random_state=0)
    info = est.cost_complexity_pruning_path(X, y)

    pruning_path = info.ccp_alphas
    impurities = info.impurities
    assert np.all(np.diff(pruning_path) >= 0)
    assert np.all(np.diff(impurities) >= 0)

    assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)


def test_prune_single_node_tree():
    # single node tree
    clf1 = DecisionTreeClassifier(random_state=0)
    clf1.fit([[0], [1]], [0, 0])

    # pruned single node tree
    clf2 = DecisionTreeClassifier(random_state=0, ccp_alpha=10)
    clf2.fit([[0], [1]], [0, 0])

    assert_is_subtree(clf1.tree_, clf2.tree_)


def assert_pruning_creates_subtree(estimator_cls, X, y, pruning_path):
    # generate trees with increasing alphas
    estimators = []
    for ccp_alpha in pruning_path:
        est = estimator_cls(max_leaf_nodes=20, ccp_alpha=ccp_alpha, random_state=0).fit(
            X, y
        )
        estimators.append(est)

    # A pruned tree must be a subtree of the previous tree (which had a
    # smaller ccp_alpha)
    for prev_est, next_est in zip(estimators, estimators[1:]):
        assert_is_subtree(prev_est.tree_, next_est.tree_)


def assert_is_subtree(tree, subtree):
    assert tree.node_count >= subtree.node_count
    assert tree.max_depth >= subtree.max_depth

    tree_c_left = tree.children_left
    tree_c_right = tree.children_right
    subtree_c_left = subtree.children_left
    subtree_c_right = subtree.children_right

    stack = [(0, 0)]
    while stack:
        tree_node_idx, subtree_node_idx = stack.pop()
        assert_array_almost_equal(
            tree.value[tree_node_idx], subtree.value[subtree_node_idx]
        )
        assert_almost_equal(
            tree.impurity[tree_node_idx], subtree.impurity[subtree_node_idx]
        )
        assert_almost_equal(
            tree.n_node_samples[tree_node_idx], subtree.n_node_samples[subtree_node_idx]
        )
        assert_almost_equal(
            tree.weighted_n_node_samples[tree_node_idx],
            subtree.weighted_n_node_samples[subtree_node_idx],
        )

        if subtree_c_left[subtree_node_idx] == subtree_c_right[subtree_node_idx]:
            # is a leaf
            assert_almost_equal(TREE_UNDEFINED, subtree.threshold[subtree_node_idx])
        else:
            # not a leaf
            assert_almost_equal(
                tree.threshold[tree_node_idx], subtree.threshold[subtree_node_idx]
            )
            stack.append((tree_c_left[tree_node_idx], subtree_c_left[subtree_node_idx]))
            stack.append(
                (tree_c_right[tree_node_idx], subtree_c_right[subtree_node_idx])
            )


@pytest.mark.parametrize("name", ALL_TREES)
@pytest.mark.parametrize("splitter", ["best", "random"])
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_apply_path_readonly_all_trees(name, splitter, sparse_container):
    dataset = DATASETS["clf_small"]
    X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False)
    if sparse_container is None:
        X_readonly = create_memmap_backed_data(X_small)
    else:
        X_readonly = sparse_container(dataset["X"])

        X_readonly.data = np.array(X_readonly.data, dtype=tree._tree.DTYPE)
        (
            X_readonly.data,
            X_readonly.indices,
            X_readonly.indptr,
        ) = create_memmap_backed_data(
            (X_readonly.data, X_readonly.indices, X_readonly.indptr)
        )

    y_readonly = create_memmap_backed_data(np.array(y_small, dtype=tree._tree.DTYPE))
    est = ALL_TREES[name](splitter=splitter)
    est.fit(X_readonly, y_readonly)
    assert_array_equal(est.predict(X_readonly), est.predict(X_small))
    assert_array_equal(
        est.decision_path(X_readonly).todense(), est.decision_path(X_small).todense()
    )


@pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse", "poisson"])
@pytest.mark.parametrize("Tree", REG_TREES.values())
def test_balance_property(criterion, Tree):
    # Test that sum(y_pred)=sum(y_true) on training set.
    # This works if the mean is predicted (should even be true for each leaf).
    # MAE predicts the median and is therefore excluded from this test.

    # Choose a training set with non-negative targets (for poisson)
    X, y = diabetes.data, diabetes.target
    reg = Tree(criterion=criterion)
    reg.fit(X, y)
    assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))


@pytest.mark.parametrize("seed", range(3))
def test_poisson_zero_nodes(seed):
    # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes.
    X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]]
    y = [0, 0, 0, 0, 1, 2, 3, 4]
    # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can
    # easily learn that:
    reg = DecisionTreeRegressor(criterion="squared_error", random_state=seed)
    reg.fit(X, y)
    assert np.amin(reg.predict(X)) == 0
    # whereas Poisson must predict strictly positive numbers
    reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
    reg.fit(X, y)
    assert np.all(reg.predict(X) > 0)

    # Test additional dataset where something could go wrong.
    n_features = 10
    X, y = datasets.make_regression(
        effective_rank=n_features * 2 // 3,
        tail_strength=0.6,
        n_samples=1_000,
        n_features=n_features,
        n_informative=n_features * 2 // 3,
        random_state=seed,
    )
    # some excess zeros
    y[(-1 < y) & (y < 0)] = 0
    # make sure the target is positive
    y = np.abs(y)
    reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
    reg.fit(X, y)
    assert np.all(reg.predict(X) > 0)


def test_poisson_vs_mse():
    # For a Poisson distributed target, Poisson loss should give better results
    # than squared error measured in Poisson deviance as metric.
    # We have a similar test, test_poisson(), in
    # sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
    rng = np.random.RandomState(42)
    n_train, n_test, n_features = 500, 500, 10
    X = datasets.make_low_rank_matrix(
        n_samples=n_train + n_test, n_features=n_features, random_state=rng
    )
    # We create a log-linear Poisson model and downscale coef as it will get
    # exponentiated.
    coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
    y = rng.poisson(lam=np.exp(X @ coef))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=rng
    )
    # We prevent some overfitting by setting min_samples_split=10.
    tree_poi = DecisionTreeRegressor(
        criterion="poisson", min_samples_split=10, random_state=rng
    )
    tree_mse = DecisionTreeRegressor(
        criterion="squared_error", min_samples_split=10, random_state=rng
    )

    tree_poi.fit(X_train, y_train)
    tree_mse.fit(X_train, y_train)
    dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)

    for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]:
        metric_poi = mean_poisson_deviance(y, tree_poi.predict(X))
        # squared_error might produce non-positive predictions => clip
        metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X), 1e-15, None))
        metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
        # As squared_error might correctly predict 0 in train set, its train
        # score can be better than Poisson. This is no longer the case for the
        # test set.
        if val == "test":
            assert metric_poi < 0.5 * metric_mse
        assert metric_poi < 0.75 * metric_dummy


@pytest.mark.parametrize("Tree", [DecisionTreeClassifier, ExtraTreeClassifier])
@pytest.mark.parametrize("n_classes", [2, 4])
def test_criterion_entropy_same_as_log_loss(Tree, n_classes):
    """Test that criterion=entropy gives same as log_loss."""
    n_samples, n_features = 50, 5
    X, y = datasets.make_classification(
        n_classes=n_classes,
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        n_redundant=0,
        random_state=42,
    )
    tree_log_loss = Tree(criterion="log_loss", random_state=43).fit(X, y)
    tree_entropy = Tree(criterion="entropy", random_state=43).fit(X, y)

    assert_tree_equal(
        tree_log_loss.tree_,
        tree_entropy.tree_,
        f"{Tree!r} with criterion 'entropy' and 'log_loss' gave different trees.",
    )
    assert_allclose(tree_log_loss.predict(X), tree_entropy.predict(X))


def test_different_endianness_pickle():
    X, y = datasets.make_classification(random_state=0)

    clf = DecisionTreeClassifier(random_state=0, max_depth=3)
    clf.fit(X, y)
    score = clf.score(X, y)

    def reduce_ndarray(arr):
        return arr.byteswap().view(arr.dtype.newbyteorder()).__reduce__()

    def get_pickle_non_native_endianness():
        f = io.BytesIO()
        p = pickle.Pickler(f)
        p.dispatch_table = copyreg.dispatch_table.copy()
        p.dispatch_table[np.ndarray] = reduce_ndarray

        p.dump(clf)
        f.seek(0)
        return f

    new_clf = pickle.load(get_pickle_non_native_endianness())
    new_score = new_clf.score(X, y)
    assert np.isclose(score, new_score)


def test_different_endianness_joblib_pickle():
    X, y = datasets.make_classification(random_state=0)

    clf = DecisionTreeClassifier(random_state=0, max_depth=3)
    clf.fit(X, y)
    score = clf.score(X, y)

    class NonNativeEndiannessNumpyPickler(NumpyPickler):
        def save(self, obj):
            if isinstance(obj, np.ndarray):
                obj = obj.byteswap().view(obj.dtype.newbyteorder())
            super().save(obj)

    def get_joblib_pickle_non_native_endianness():
        f = io.BytesIO()
        p = NonNativeEndiannessNumpyPickler(f)

        p.dump(clf)
        f.seek(0)
        return f

    new_clf = joblib.load(get_joblib_pickle_non_native_endianness())
    new_score = new_clf.score(X, y)
    assert np.isclose(score, new_score)


def get_different_bitness_node_ndarray(node_ndarray):
    new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32

    # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd)
    indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"]

    new_dtype_dict = {
        name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()
    }
    for name in indexing_field_names:
        new_dtype_dict[name] = new_dtype_for_indexing_fields

    new_dtype = np.dtype(
        {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())}
    )
    return node_ndarray.astype(new_dtype, casting="same_kind")


def get_different_alignment_node_ndarray(node_ndarray):
    new_dtype_dict = {
        name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()
    }
    offsets = [offset for dtype, offset in node_ndarray.dtype.fields.values()]
    shifted_offsets = [8 + offset for offset in offsets]

    new_dtype = np.dtype(
        {
            "names": list(new_dtype_dict.keys()),
            "formats": list(new_dtype_dict.values()),
            "offsets": shifted_offsets,
        }
    )
    return node_ndarray.astype(new_dtype, casting="same_kind")


def reduce_tree_with_different_bitness(tree):
    new_dtype = np.int64 if _IS_32BIT else np.int32
    tree_cls, (n_features, n_classes, n_outputs), state = tree.__reduce__()
    new_n_classes = n_classes.astype(new_dtype, casting="same_kind")

    new_state = state.copy()
    new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"])

    return (tree_cls, (n_features, new_n_classes, n_outputs), new_state)


def test_different_bitness_pickle():
    X, y = datasets.make_classification(random_state=0)

    clf = DecisionTreeClassifier(random_state=0, max_depth=3)
    clf.fit(X, y)
    score = clf.score(X, y)

    def pickle_dump_with_different_bitness():
        f = io.BytesIO()
        p = pickle.Pickler(f)
        p.dispatch_table = copyreg.dispatch_table.copy()
        p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness

        p.dump(clf)
        f.seek(0)
        return f

    new_clf = pickle.load(pickle_dump_with_different_bitness())
    new_score = new_clf.score(X, y)
    assert score == pytest.approx(new_score)


def test_different_bitness_joblib_pickle():
    # Make sure that a platform specific pickle generated on a 64 bit
    # platform can be converted at pickle load time into an estimator
    # with Cython code that works with the host's native integer precision
    # to index nodes in the tree data structure when the host is a 32 bit
    # platform (and vice versa).
    X, y = datasets.make_classification(random_state=0)

    clf = DecisionTreeClassifier(random_state=0, max_depth=3)
    clf.fit(X, y)
    score = clf.score(X, y)

    def joblib_dump_with_different_bitness():
        f = io.BytesIO()
        p = NumpyPickler(f)
        p.dispatch_table = copyreg.dispatch_table.copy()
        p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness

        p.dump(clf)
        f.seek(0)
        return f

    new_clf = joblib.load(joblib_dump_with_different_bitness())
    new_score = new_clf.score(X, y)
    assert score == pytest.approx(new_score)


def test_check_n_classes():
    expected_dtype = np.dtype(np.int32) if _IS_32BIT else np.dtype(np.int64)
    allowed_dtypes = [np.dtype(np.int32), np.dtype(np.int64)]
    allowed_dtypes += [dt.newbyteorder() for dt in allowed_dtypes]

    n_classes = np.array([0, 1], dtype=expected_dtype)
    for dt in allowed_dtypes:
        _check_n_classes(n_classes.astype(dt), expected_dtype)

    with pytest.raises(ValueError, match="Wrong dimensions.+n_classes"):
        wrong_dim_n_classes = np.array([[0, 1]], dtype=expected_dtype)
        _check_n_classes(wrong_dim_n_classes, expected_dtype)

    with pytest.raises(ValueError, match="n_classes.+incompatible dtype"):
        wrong_dtype_n_classes = n_classes.astype(np.float64)
        _check_n_classes(wrong_dtype_n_classes, expected_dtype)


def test_check_value_ndarray():
    expected_dtype = np.dtype(np.float64)
    expected_shape = (5, 1, 2)
    value_ndarray = np.zeros(expected_shape, dtype=expected_dtype)

    allowed_dtypes = [expected_dtype, expected_dtype.newbyteorder()]

    for dt in allowed_dtypes:
        _check_value_ndarray(
            value_ndarray, expected_dtype=dt, expected_shape=expected_shape
        )

    with pytest.raises(ValueError, match="Wrong shape.+value array"):
        _check_value_ndarray(
            value_ndarray, expected_dtype=expected_dtype, expected_shape=(1, 2)
        )

    for problematic_arr in [value_ndarray[:, :, :1], np.asfortranarray(value_ndarray)]:
        with pytest.raises(ValueError, match="value array.+C-contiguous"):
            _check_value_ndarray(
                problematic_arr,
                expected_dtype=expected_dtype,
                expected_shape=problematic_arr.shape,
            )

    with pytest.raises(ValueError, match="value array.+incompatible dtype"):
        _check_value_ndarray(
            value_ndarray.astype(np.float32),
            expected_dtype=expected_dtype,
            expected_shape=expected_shape,
        )


def test_check_node_ndarray():
    expected_dtype = NODE_DTYPE

    node_ndarray = np.zeros((5,), dtype=expected_dtype)

    valid_node_ndarrays = [
        node_ndarray,
        get_different_bitness_node_ndarray(node_ndarray),
        get_different_alignment_node_ndarray(node_ndarray),
    ]
    valid_node_ndarrays += [
        arr.astype(arr.dtype.newbyteorder()) for arr in valid_node_ndarrays
    ]

    for arr in valid_node_ndarrays:
        _check_node_ndarray(node_ndarray, expected_dtype=expected_dtype)

    with pytest.raises(ValueError, match="Wrong dimensions.+node array"):
        problematic_node_ndarray = np.zeros((5, 2), dtype=expected_dtype)
        _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)

    with pytest.raises(ValueError, match="node array.+C-contiguous"):
        problematic_node_ndarray = node_ndarray[::2]
        _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)

    dtype_dict = {name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()}

    # array with wrong 'threshold' field dtype (int64 rather than float64)
    new_dtype_dict = dtype_dict.copy()
    new_dtype_dict["threshold"] = np.int64

    new_dtype = np.dtype(
        {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())}
    )
    problematic_node_ndarray = node_ndarray.astype(new_dtype)

    with pytest.raises(ValueError, match="node array.+incompatible dtype"):
        _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)

    # array with wrong 'left_child' field dtype (float64 rather than int64 or int32)
    new_dtype_dict = dtype_dict.copy()
    new_dtype_dict["left_child"] = np.float64
    new_dtype = np.dtype(
        {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())}
    )

    problematic_node_ndarray = node_ndarray.astype(new_dtype)

    with pytest.raises(ValueError, match="node array.+incompatible dtype"):
        _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)


@pytest.mark.parametrize(
    "Splitter", chain(DENSE_SPLITTERS.values(), SPARSE_SPLITTERS.values())
)
def test_splitter_serializable(Splitter):
    """Check that splitters are serializable."""
    rng = np.random.RandomState(42)
    max_features = 10
    n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp)

    criterion = CRITERIA_CLF["gini"](n_outputs, n_classes)
    splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None)
    splitter_serialize = pickle.dumps(splitter)

    splitter_back = pickle.loads(splitter_serialize)
    assert splitter_back.max_features == max_features
    assert isinstance(splitter_back, Splitter)


def test_tree_deserialization_from_read_only_buffer(tmpdir):
    """Check that Trees can be deserialized with read only buffers.

    Non-regression test for gh-25584.
    """
    pickle_path = str(tmpdir.join("clf.joblib"))
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_small, y_small)

    joblib.dump(clf, pickle_path)
    loaded_clf = joblib.load(pickle_path, mmap_mode="r")

    assert_tree_equal(
        loaded_clf.tree_,
        clf.tree_,
        "The trees of the original and loaded classifiers are not equal.",
    )


@pytest.mark.parametrize("Tree", ALL_TREES.values())
def test_min_sample_split_1_error(Tree):
    """Check that an error is raised when min_sample_split=1.

    non-regression test for issue gh-25481.
    """
    X = np.array([[0, 0], [1, 1]])
    y = np.array([0, 1])

    # min_samples_split=1.0 is valid
    Tree(min_samples_split=1.0).fit(X, y)

    # min_samples_split=1 is invalid
    tree = Tree(min_samples_split=1)
    msg = (
        r"'min_samples_split' .* must be an int in the range \[2, inf\) "
        r"or a float in the range \(0.0, 1.0\]"
    )
    with pytest.raises(ValueError, match=msg):
        tree.fit(X, y)


@pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"])
def test_missing_values_best_splitter_on_equal_nodes_no_missing(criterion):
    """Check missing values goes to correct node during predictions."""
    X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
    y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])

    dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
    dtc.fit(X, y)

    # Goes to right node because it has the most data points
    y_pred = dtc.predict([[np.nan]])
    assert_allclose(y_pred, [np.mean(y[-5:])])

    # equal number of elements in both nodes
    X_equal = X[:-1]
    y_equal = y[:-1]

    dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
    dtc.fit(X_equal, y_equal)

    # Goes to right node because the implementation sets:
    # missing_go_to_left = n_left > n_right, which is False
    y_pred = dtc.predict([[np.nan]])
    assert_allclose(y_pred, [np.mean(y_equal[-4:])])


@pytest.mark.parametrize("seed", range(3))
@pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"])
def test_missing_values_random_splitter_on_equal_nodes_no_missing(criterion, seed):
    """Check missing values go to the correct node during predictions for ExtraTree.

    Since ETC use random splits, we use different seeds to verify that the
    left/right node is chosen correctly when the splits occur.
    """
    X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
    y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])

    etr = ExtraTreeRegressor(random_state=seed, max_depth=1, criterion=criterion)
    etr.fit(X, y)

    # Get the left and right children of the root node
    left_child = etr.tree_.children_left[0]
    right_child = etr.tree_.children_right[0]

    # Get the number of samples for the left and right children
    left_samples = etr.tree_.weighted_n_node_samples[left_child]
    right_samples = etr.tree_.weighted_n_node_samples[right_child]
    went_left = left_samples > right_samples

    # predictions
    y_pred_left = etr.tree_.value[left_child][0]
    y_pred_right = etr.tree_.value[right_child][0]

    # Goes to node with the most data points
    y_pred = etr.predict([[np.nan]])
    if went_left:
        assert_allclose(y_pred_left, y_pred)
    else:
        assert_allclose(y_pred_right, y_pred)


@pytest.mark.parametrize("criterion", ["entropy", "gini"])
def test_missing_values_best_splitter_three_classes(criterion):
    """Test when missing values are uniquely present in a class among 3 classes."""
    missing_values_class = 0
    X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 8, 9, 11, 12]]).T
    y = np.array([missing_values_class] * 4 + [1] * 4 + [2] * 4)
    dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion)
    dtc.fit(X, y)

    X_test = np.array([[np.nan, 3, 12]]).T
    y_nan_pred = dtc.predict(X_test)
    # Missing values necessarily are associated to the observed class.
    assert_array_equal(y_nan_pred, [missing_values_class, 1, 2])


@pytest.mark.parametrize("criterion", ["entropy", "gini"])
def test_missing_values_best_splitter_to_left(criterion):
    """Missing values spanning only one class at fit-time must make missing
    values at predict-time be classified has belonging to this class."""
    X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T
    y = np.array([0] * 4 + [1] * 6)

    dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion)
    dtc.fit(X, y)

    X_test = np.array([[np.nan, 5, np.nan]]).T
    y_pred = dtc.predict(X_test)

    assert_array_equal(y_pred, [0, 1, 0])


@pytest.mark.parametrize("criterion", ["entropy", "gini"])
def test_missing_values_best_splitter_to_right(criterion):
    """Missing values and non-missing values sharing one class at fit-time
    must make missing values at predict-time be classified has belonging
    to this class."""
    X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T
    y = np.array([1] * 4 + [0] * 4 + [1] * 2)

    dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion)
    dtc.fit(X, y)

    X_test = np.array([[np.nan, 1.2, 4.8]]).T
    y_pred = dtc.predict(X_test)

    assert_array_equal(y_pred, [1, 0, 1])


@pytest.mark.parametrize("criterion", ["entropy", "gini"])
def test_missing_values_best_splitter_missing_both_classes_has_nan(criterion):
    """Check behavior of missing value when there is one missing value in each class."""
    X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T
    y = np.array([0] * 5 + [1] * 5)

    dtc = DecisionTreeClassifier(random_state=42, max_depth=1, criterion=criterion)
    dtc.fit(X, y)
    X_test = np.array([[np.nan, 2.3, 34.2]]).T
    y_pred = dtc.predict(X_test)

    # Missing value goes to the class at the right (here 1) because the implementation
    # searches right first.
    assert_array_equal(y_pred, [1, 0, 1])


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize(
    "tree",
    [
        DecisionTreeRegressor(criterion="absolute_error"),
        ExtraTreeRegressor(criterion="absolute_error"),
    ],
)
def test_missing_value_errors(sparse_container, tree):
    """Check unsupported configurations for missing values."""

    X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T
    y = np.array([0] * 5 + [1] * 5)

    if sparse_container is not None:
        X = sparse_container(X)

    with pytest.raises(ValueError, match="Input X contains NaN"):
        tree.fit(X, y)


@pytest.mark.parametrize("Tree", REG_TREES.values())
def test_missing_values_poisson(Tree):
    """Smoke test for poisson regression and missing values."""
    X, y = diabetes.data.copy(), diabetes.target

    # Set some values missing
    X[::5, 0] = np.nan
    X[::6, -1] = np.nan

    reg = Tree(criterion="poisson", random_state=42)
    reg.fit(X, y)

    y_pred = reg.predict(X)
    assert (y_pred >= 0.0).all()


def make_friedman1_classification(*args, **kwargs):
    X, y = datasets.make_friedman1(*args, **kwargs)
    y = y > 14
    return X, y


@pytest.mark.parametrize(
    "make_data, Tree, tolerance",
    [
        # Due to the sine link between X and y, we expect the native handling of
        # missing values to always be better than the naive mean imputation in the
        # regression case.
        #
        # Due to randomness in ExtraTree, we expect the native handling of missing
        # values to be sometimes better than the naive mean imputation, but not always
        (datasets.make_friedman1, DecisionTreeRegressor, 0),
        (datasets.make_friedman1, ExtraTreeRegressor, 0.07),
        (make_friedman1_classification, DecisionTreeClassifier, 0.03),
        (make_friedman1_classification, ExtraTreeClassifier, 0.12),
    ],
)
@pytest.mark.parametrize("sample_weight_train", [None, "ones"])
def test_missing_values_is_resilience(
    make_data, Tree, sample_weight_train, global_random_seed, tolerance
):
    """Check that trees can deal with missing values have decent performance."""
    n_samples, n_features = 5_000, 10
    X, y = make_data(
        n_samples=n_samples,
        n_features=n_features,
        noise=1.0,
        random_state=global_random_seed,
    )

    X_missing = X.copy()
    rng = np.random.RandomState(global_random_seed)
    X_missing[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan
    X_missing_train, X_missing_test, y_train, y_test = train_test_split(
        X_missing, y, random_state=global_random_seed
    )
    if sample_weight_train == "ones":
        sample_weight = np.ones(X_missing_train.shape[0])
    else:
        sample_weight = None

    # max_depth is used to avoid overfitting and also improve the runtime
    # of the test.
    max_depth = 10
    native_tree = Tree(max_depth=max_depth, random_state=global_random_seed)
    native_tree.fit(X_missing_train, y_train, sample_weight=sample_weight)
    score_native_tree = native_tree.score(X_missing_test, y_test)

    tree_with_imputer = make_pipeline(
        SimpleImputer(), Tree(max_depth=max_depth, random_state=global_random_seed)
    )
    tree_with_imputer.fit(X_missing_train, y_train)
    score_tree_with_imputer = tree_with_imputer.score(X_missing_test, y_test)

    assert score_native_tree + tolerance > score_tree_with_imputer, (
        f"{score_native_tree=} + {tolerance} should be strictly greater than"
        f" {score_tree_with_imputer}"
    )


# A single ExtraTree will randomly send missing values down the left, or right child,
# and therefore will not necessarily have the same performance as the greedy
# handling of missing values.
@pytest.mark.parametrize("Tree, expected_score", zip(CLF_TREES.values(), [0.85, 0.53]))
def test_missing_value_is_predictive(Tree, expected_score, global_random_seed):
    """Check the tree learns when only the missing value is predictive."""
    rng = np.random.RandomState(0)
    n_samples = 500

    X = rng.standard_normal(size=(n_samples, 20))
    y = np.concatenate([np.zeros(n_samples // 2), np.ones(n_samples // 2)])
    # y = rng.randint(0, high=2, size=n_samples)

    # Create a predictive feature using `y` and with some noise
    X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05])
    y_mask = y.copy().astype(bool)
    y_mask[X_random_mask] = ~y_mask[X_random_mask]

    X_predictive = rng.standard_normal(size=n_samples)
    X_predictive[y_mask] = np.nan

    X[:, 5] = X_predictive

    tree = Tree(random_state=global_random_seed)

    # Check that the tree can learn the predictive feature
    # over an average of cross-validation fits.
    tree_cv_score = cross_val_score(tree, X, y, cv=5).mean()
    assert (
        tree_cv_score >= expected_score
    ), f"Expected CV score: {expected_score} but got {tree_cv_score}"


@pytest.mark.parametrize(
    "make_data, Tree",
    [
        (datasets.make_regression, DecisionTreeRegressor),
        (datasets.make_classification, DecisionTreeClassifier),
    ],
)
def test_sample_weight_non_uniform(make_data, Tree):
    """Check sample weight is correctly handled with missing values."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 1000, 10
    X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng)

    # Create dataset with missing values
    X[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan

    # Zero sample weight is the same as removing the sample
    sample_weight = np.ones(X.shape[0])
    sample_weight[::2] = 0.0

    tree_with_sw = Tree(random_state=0)
    tree_with_sw.fit(X, y, sample_weight=sample_weight)

    tree_samples_removed = Tree(random_state=0)
    tree_samples_removed.fit(X[1::2, :], y[1::2])

    assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X))


def test_deterministic_pickle():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/27268
    # Uninitialised memory would lead to the two pickle strings being different.
    tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
    tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)

    pickle1 = pickle.dumps(tree1)
    pickle2 = pickle.dumps(tree2)

    assert pickle1 == pickle2


@pytest.mark.parametrize("Tree", [DecisionTreeRegressor, ExtraTreeRegressor])
@pytest.mark.parametrize(
    "X",
    [
        # missing values will go left for greedy splits
        np.array([np.nan, 2, np.nan, 4, 5, 6]),
        np.array([np.nan, np.nan, 3, 4, 5, 6]),
        # missing values will go right for greedy splits
        np.array([1, 2, 3, 4, np.nan, np.nan]),
        np.array([1, 2, 3, np.nan, 6, np.nan]),
    ],
)
@pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"])
def test_regression_tree_missing_values_toy(Tree, X, criterion):
    """Check that we properly handle missing values in regression trees using a toy
    dataset.

    The regression targeted by this test was that we were not reinitializing the
    criterion when it comes to the number of missing values. Therefore, the value
    of the critetion (i.e. MSE) was completely wrong.

    This test check that the MSE is null when there is a single sample in the leaf.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28254
    https://github.com/scikit-learn/scikit-learn/issues/28316
    """
    X = X.reshape(-1, 1)
    y = np.arange(6)

    tree = Tree(criterion=criterion, random_state=0).fit(X, y)
    tree_ref = clone(tree).fit(y.reshape(-1, 1), y)

    impurity = tree.tree_.impurity
    assert all(impurity >= 0), impurity.min()  # MSE should always be positive

    # Check the impurity match after the first split
    assert_allclose(tree.tree_.impurity[:2], tree_ref.tree_.impurity[:2])

    # Find the leaves with a single sample where the MSE should be 0
    leaves_idx = np.flatnonzero(
        (tree.tree_.children_left == -1) & (tree.tree_.n_node_samples == 1)
    )
    assert_allclose(tree.tree_.impurity[leaves_idx], 0.0)


def test_regression_extra_tree_missing_values_toy(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 100
    X = np.arange(n_samples, dtype=np.float64).reshape(-1, 1)
    X[-20:, :] = np.nan
    rng.shuffle(X)
    y = np.arange(n_samples)

    tree = ExtraTreeRegressor(random_state=global_random_seed, max_depth=5).fit(X, y)

    impurity = tree.tree_.impurity
    assert all(impurity >= 0), impurity  # MSE should always be positive


def test_classification_tree_missing_values_toy():
    """Check that we properly handle missing values in clasification trees using a toy
    dataset.

    The test is more involved because we use a case where we detected a regression
    in a random forest. We therefore define the seed and bootstrap indices to detect
    one of the non-frequent regression.

    Here, we check that the impurity is null or positive in the leaves.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/28254
    """
    X, y = datasets.load_iris(return_X_y=True)

    rng = np.random.RandomState(42)
    X_missing = X.copy()
    mask = rng.binomial(
        n=np.ones(shape=(1, 4), dtype=np.int32), p=X[:, [2]] / 8
    ).astype(bool)
    X_missing[mask] = np.nan
    X_train, _, y_train, _ = train_test_split(X_missing, y, random_state=13)

    # fmt: off
    # no black reformatting for this specific array
    indices = np.array([
        2, 81, 39, 97, 91, 38, 46, 31, 101, 13, 89, 82, 100, 42, 69, 27, 81, 16, 73, 74,
        51, 47, 107, 17, 75, 110, 20, 15, 104, 57, 26, 15, 75, 79, 35, 77, 90, 51, 46,
        13, 94, 91, 23, 8, 93, 93, 73, 77, 12, 13, 74, 109, 110, 24, 10, 23, 104, 27,
        92, 52, 20, 109, 8, 8, 28, 27, 35, 12, 12, 7, 43, 0, 30, 31, 78, 12, 24, 105,
        50, 0, 73, 12, 102, 105, 13, 31, 1, 69, 11, 32, 75, 90, 106, 94, 60, 56, 35, 17,
        62, 85, 81, 39, 80, 16, 63, 6, 80, 84, 3, 3, 76, 78
    ], dtype=np.int32)
    # fmt: on

    tree = DecisionTreeClassifier(
        max_depth=3, max_features="sqrt", random_state=1857819720
    )
    tree.fit(X_train[indices], y_train[indices])
    assert all(tree.tree_.impurity >= 0)

    leaves_idx = np.flatnonzero(
        (tree.tree_.children_left == -1) & (tree.tree_.n_node_samples == 1)
    )
    assert_allclose(tree.tree_.impurity[leaves_idx], 0.0)


def test_build_pruned_tree_py():
    """Test pruning a tree with the Python caller of the Cythonized prune tree."""
    tree = DecisionTreeClassifier(random_state=0, max_depth=1)
    tree.fit(iris.data, iris.target)

    n_classes = np.atleast_1d(tree.n_classes_)
    pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_)

    # only keep the root note
    leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8)
    leave_in_subtree[0] = 1
    _build_pruned_tree_py(pruned_tree, tree.tree_, leave_in_subtree)

    assert tree.tree_.node_count == 3
    assert pruned_tree.node_count == 1
    with pytest.raises(AssertionError):
        assert_array_equal(tree.tree_.value, pruned_tree.value)
    assert_array_equal(tree.tree_.value[0], pruned_tree.value[0])

    # now keep all the leaves
    pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_)
    leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8)
    leave_in_subtree[1:] = 1

    # Prune the tree
    _build_pruned_tree_py(pruned_tree, tree.tree_, leave_in_subtree)
    assert tree.tree_.node_count == 3
    assert pruned_tree.node_count == 3, pruned_tree.node_count
    assert_array_equal(tree.tree_.value, pruned_tree.value)


def test_build_pruned_tree_infinite_loop():
    """Test pruning a tree does not result in an infinite loop."""

    # Create a tree with root and two children
    tree = DecisionTreeClassifier(random_state=0, max_depth=1)
    tree.fit(iris.data, iris.target)
    n_classes = np.atleast_1d(tree.n_classes_)
    pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_)

    # only keeping one child as a leaf results in an improper tree
    leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8)
    leave_in_subtree[1] = 1
    with pytest.raises(
        ValueError, match="Node has reached a leaf in the original tree"
    ):
        _build_pruned_tree_py(pruned_tree, tree.tree_, leave_in_subtree)


def test_sort_log2_build():
    """Non-regression test for gh-30554.

    Using log2 and log in sort correctly sorts feature_values, but the tie breaking is
    different which can results in placing samples in a different order.
    """
    rng = np.random.default_rng(75)
    some = rng.normal(loc=0.0, scale=10.0, size=10).astype(np.float32)
    feature_values = np.concatenate([some] * 5)
    samples = np.arange(50)
    _py_sort(feature_values, samples, 50)
    # fmt: off
    # no black reformatting for this specific array
    expected_samples = [
        0, 40, 30, 20, 10, 29, 39, 19, 49,  9, 45, 15, 35,  5, 25, 11, 31,
        41,  1, 21, 22, 12,  2, 42, 32, 23, 13, 43,  3, 33,  6, 36, 46, 16,
        26,  4, 14, 24, 34, 44, 27, 47,  7, 37, 17,  8, 38, 48, 28, 18
    ]
    # fmt: on
    assert_array_equal(samples, expected_samples)