scikit-learn的plot_partial_dependence()对已正确拟合的模型（例如Keras回归器或LGBMClassifier）错误地引发NotFittedError

from sklearn.datasets import load_boston
from sklearn.inspection import plot_partial_dependence, partial_dependence
from keras.wrappers.scikit_learn import KerasRegressor
import keras
import tensorflow as tf
import pandas as pd

# Load the Boston housing data into a DataFrame so the column names are kept.
boston = load_boston()
feature_names = boston.feature_names
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target

# Standardize each feature with its own column mean and standard deviation.
mean = X.describe().transpose()['mean']
std = X.describe().transpose()['std']
X_norm = (X - mean) / std


def build_model_small():
    """Build and compile a small fully connected regression network.

    Returns a compiled ``keras.Sequential`` model with two hidden layers of
    64 ReLU units and a single linear output unit.
    """
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=[len(X.keys())]),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1),
    ])
    optimizer = keras.optimizers.RMSprop(0.0005)
    model.compile(loss='mse', optimizer=optimizer,
                  metrics=['mae', 'mse', 'mape'])
    return model


# Wrap the Keras model in the scikit-learn compatible regressor and fit it.
kr = KerasRegressor(build_fn=build_model_small, verbose=0)
kr.fit(X_norm, y, epochs=100, validation_split=0.2)

# This is the call the question reports as failing on the fitted model.
pdp_plot = plot_partial_dependence(kr, X_norm, feature_names)

Traceback (most recent call last): File "temp_ML_tf_sklearn_postproc.py", line 79, in <module> pdp_plot = plot_partial_dependence(kr,X,labels[:-1]) File "/home/mymachine/anaconda3/lib/python3.7/site-packages/sklearn/inspection/_partial_dependence.py", line 678, in plot_partial_dependence for fxs in features) File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 921, in __call__ if self.dispatch_one_batch(iterator): File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch self._dispatch(tasks) File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch job = self._backend.apply_async(batch, callback=cb) File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 182, in apply_async result = ImmediateResult(func) File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 549, in __init__ self.results = batch() File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__ for func, args, kwargs in self.items] File "/home/mymachine/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp> for func, args, kwargs in self.items] File "/home/mymachine/anaconda3/lib/python3.7/site-packages/sklearn/inspection/_partial_dependence.py", line 307, in partial_dependence "'estimator' must be a fitted regressor or classifier." ValueError: 'estimator' must be a fitted regressor or classifier.

Traceback (most recent call last): File "temp_ML_tf_sklearn_postproc.py", line 79, in <module> pdp_plot = plot_partial_dependence(kr,X,labels[:-1]) File "/home/billy/anaconda3/lib/python3.7/site-packages/sklearn/inspection/_partial_dependence.py", line 678, in plot_partial_dependence for fxs in features) File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 921, in __call__ if self.dispatch_one_batch(iterator): File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch self._dispatch(tasks) File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch job = self._backend.apply_async(batch, callback=cb) File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 182, in apply_async result = ImmediateResult(func) File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 549, in __init__ self.results = batch() File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in __call__ for func, args, kwargs in self.items] File "/home/billy/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 225, in <listcomp> for func, args, kwargs in self.items] File "/home/billy/anaconda3/lib/python3.7/site-packages/sklearn/inspection/_partial_dependence.py", line 317, in partial_dependence check_is_fitted(est) File "/home/billy/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 967, in check_is_fitted raise NotFittedError(msg % {'name': type(estimator).__name__}) sklearn.exceptions.NotFittedError: This KerasRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

2条回答

网友

1楼 · 编辑于 2024-05-15 13:15:59

我最终找到了一个简单粗糙的变通办法，它在这个特定的案例中成功奏效。然而，这不是一个非常令人满意的答案，我也不能保证它适用于所有情况，所以如果有人有更通用的答案，我很希望看到。但我还是把它贴在这里，以防其他人需要解决这个问题

我只是简单地将源代码（在我的anaconda安装中，它位于~/anaconda3/lib/python3.7/site-packages/sklearn/inspection/_partial_dependence.py）复制到我的项目目录中名为custom_pdp.py的文件中，并在该文件中把有问题的部分注释掉（必要时，硬编码我自己的替代值）

在我的代码中，我使用了导入行import custom_pdp as cpdp，而不是从sklearn导入它，然后以cpdp.plot_partial_dependence(...)的形式调用plot_partial_dependence

下面是我必须从源文件更改的行。请注意，您需要复制整个源文件，因为其中定义了其他需要的函数，但我只做了如下更改。另外，这是通过sklearn 0.22.1完成的-它可能不适用于其他版本

首先，必须更改顶部的相对导入行，如下所示：

from sklearn.utils.extmath import cartesian
from sklearn.utils import check_array
from sklearn.utils import check_matplotlib_support  # noqa
from sklearn.utils import _safe_indexing
from sklearn.utils import _determine_key_type
from sklearn.utils import _get_column_indices
from sklearn.utils.validation import check_is_fitted
from sklearn.tree._tree import DTYPE
from sklearn.exceptions import NotFittedError
from sklearn.ensemble._gb import BaseGradientBoosting
from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import (
    BaseHistGradientBoosting)

（它们以前是相对路径，如from ..utils.extmath import cartesian等）

然后，仅更改了以下函数：

在_partial_dependence_brute中：

def _partial_dependence_brute(est, grid, features, X, response_method):

    ... (skipping docstring)

    averaged_predictions = []

    # define the prediction_method (predict, predict_proba, decision_function).
    # if is_regressor(est):
    #     prediction_method = est.predict
    # else:
    #     predict_proba = getattr(est, 'predict_proba', None)
    #     decision_function = getattr(est, 'decision_function', None)
    #     if response_method == 'auto':
    #         # try predict_proba, then decision_function if it doesn't exist
    #         prediction_method = predict_proba or decision_function
    #     else:
    #         prediction_method = (predict_proba if response_method ==
    #                              'predict_proba' else decision_function)
    #     if prediction_method is None:
    #         if response_method == 'auto':
    #             raise ValueError(
    #                 'The estimator has no predict_proba and no '
    #                 'decision_function method.'
    #             )
    #         elif response_method == 'predict_proba':
    #             raise ValueError('The estimator has no predict_proba method.')
    #         else:
    #             raise ValueError(
    #                 'The estimator has no decision_function method.')
    prediction_method = est.predict

    #the rest in this function are as they were before, beginning with:
    for new_values in grid:
        X_eval = X.copy()

        ....

然后注释掉partial_dependence定义的前20行：

def partial_dependence(estimator, X, features, response_method='auto',
                   percentiles=(0.05, 0.95), grid_resolution=100,
                   method='auto'):
    ... (skipping docstring)
    # if not (is_classifier(estimator) or is_regressor(estimator)):
    #     raise ValueError(
    #         "'estimator' must be a fitted regressor or classifier."
    #     )
    # 
    # if isinstance(estimator, Pipeline):
    #     # TODO: to be removed if/when pipeline get a `steps_` attributes
    #     # assuming Pipeline is the only estimator that does not store a new
    #     # attribute
    #     for est in estimator:
    #         # FIXME: remove the None option when it will be deprecated
    #         if est not in (None, 'drop'):
    #             check_is_fitted(est)
    # else:
    #     check_is_fitted(estimator)
    # 
    # if (is_classifier(estimator) and
    #         isinstance(estimator.classes_[0], np.ndarray)):
    #     raise ValueError(
    #         'Multiclass-multioutput estimators are not supported'
    #     )

    #The rest of the function continues as it was:
    # Use check_array only on lists and other non-array-likes / sparse. Do not
    # convert DataFrame into a NumPy array.
    if not(hasattr(X, '__array__') or sparse.issparse(X)):
        X = check_array(X, force_all_finite='allow-nan', dtype=np.object)

        ....

如果您的模型属于不同的类别或使用不同的参数，则可能需要进行其他更改

在我的模型上，它完全符合我的期望。但就像我说的，这是一个变通办法，不是最令人满意的解决方案。此外，根据您尝试使用的模型或参数的类型，您的成功可能会有很大差异

网友

2楼 · 编辑于 2024-05-15 13:15:59

出现此问题的原因是，非scikit-learn的模型对象（如LightGBMRegressor或LGBMClassifier）不包含以下划线结尾的属性，而check_is_fitted()正是通过检查是否存在以下划线结尾的属性来判断模型是否已拟合（请参见文档）

因此，一个简单的解决方法是在经过训练的模型对象中添加一个名称以下划线结尾的虚拟属性：

test_model.dummy_ = "dummy"

您还可以通过自己调用check_is_fitted()来验证它是否有效：

from sklearn.utils import validation

validation.check_is_fitted(estimator=test_model)

相关问题更多 >

编程相关推荐

热门问题

热门文章