评估模型时出错:分类指标无法处理混合的二元和连续目标
我在用Python的scikit-learn库评估各种回归模型时遇到了一个问题。我写了一段代码来训练和评估不同的算法,包括线性回归(LinearRegression)、决策树回归(DecisionTreeRegressor)、随机森林回归(RandomForestRegressor)、支持向量回归(SVR)和多层感知器回归(MLPRegressor)。但是,当我试图计算这些回归模型的分类指标,比如准确率、精确率、召回率和F1分数时,出现了以下错误:
ValueError: Classification metrics can't handle a mix of binary and continuous targets(分类指标无法处理二元和连续目标的混合)
根据我的理解,这个错误是因为我在回归问题上使用了分类指标。然而,我的数据集中只有二元结果变量,所以这不应该是个问题。
这是我代码的一个版本:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.feature_selection
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
# Find performance of model using preprocessed data
def _hard_predict(model, features):
    """Return the estimator's raw predictions for *features*."""
    return model.predict(features)


def _positive_proba(model, features):
    """Return the positive-class probability column from ``predict_proba``."""
    return model.predict_proba(features)[::, 1]


# Registry of candidate estimators. Each entry maps an algorithm name to:
#   'constructor'  - an unfitted estimator instance,
#   'predict'      - callable(model, x_test) giving hard predictions,
#   'predict_prob' - callable giving positive-class probabilities,
#                    or None when the estimator has no predict_proba.
algorithms = {
    'LogisticRegression': {
        'constructor': LogisticRegression(max_iter=4),
        'predict': _hard_predict,
        'predict_prob': _positive_proba,
    },
}
for _name, _ctor in (
    ('LinearRegression', LinearRegression),
    ('DecisionTreeRegressor', DecisionTreeRegressor),
    ('RandomForestRegressor', RandomForestRegressor),
    ('SVR', SVR),
    ('MLPRegressor', MLPRegressor),
):
    algorithms[_name] = {
        'constructor': _ctor(),
        'predict': _hard_predict,
        'predict_prob': None,
    }
def ds_split(dataset, dependent_var, split):
    """Split *dataset* into train/test feature frames and target series.

    Drops *dependent_var* from the features, performs a train/test split
    with ``train_size=split`` (seeded for reproducibility), then runs
    SelectKBest with ``k="all"`` — which keeps every column, so the
    selection step is effectively a pass-through on column order.

    Returns ``(x_train, x_test, y_train, y_test)``.
    """
    features = dataset.drop(dependent_var, axis=1)
    target = dataset.get(dependent_var)
    x_tr, x_te, y_tr, y_te = train_test_split(
        features, target, train_size=split, random_state=1)
    # Fit the selector on the training fold only, then keep the chosen columns.
    selector = sklearn.feature_selection.SelectKBest(k="all").fit(x_tr, y_tr)
    kept = [features.columns[i] for i in selector.get_support(indices=True)]
    return x_tr[kept], x_te[kept], y_tr, y_te
# Single preprocessed dataset, split once and shared by every experiment.
datasets = {
    'processed': ds_split(undersampled_df, 'hospital_expire_flag', 0.7),
}

# One experiment per registered algorithm, all on the preprocessed split.
models = [
    {'algo': algo_name, 'ds': 'processed'}
    for algo_name in (
        'LogisticRegression',
        'LinearRegression',
        'DecisionTreeRegressor',
        'RandomForestRegressor',
        'SVR',
        'MLPRegressor',
    )
]
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle # For saving the model to a file
from sklearn import metrics
def evaluate_model(algo, x_train, y_train, x_test, y_test, model_filename):
    """Fit one algorithm, persist it, and compute its evaluation metrics.

    Parameters
    ----------
    algo : dict
        Entry from the ``algorithms`` registry with 'constructor',
        'predict', and 'predict_prob' keys ('predict_prob' may be None).
    x_train, y_train, x_test, y_test :
        Train/test splits as produced by ``ds_split``.
    model_filename : str
        Path where the fitted model is pickled.

    Returns
    -------
    dict of metrics; 'fpr', 'tpr' and 'auc' are None when the model
    cannot produce probability scores.
    """
    model = algo['constructor']
    model.fit(x_train, y_train)
    # Persist the fitted model for later reuse.
    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)
    # Predict test data.
    y_pred = algo['predict'](model, x_test)
    # NOTE(review): the classification metrics below require discrete class
    # labels. Regressors (LinearRegression, SVR, ...) return continuous
    # floats, which is exactly what raises "ValueError: Classification
    # metrics can't handle a mix of binary and continuous targets" — use
    # classifier counterparts (or threshold y_pred) before scoring these.
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    # AUC needs probability scores. Guard against 'predict_prob' being None
    # (fixes TypeError: 'NoneType' object is not callable for regressors).
    if algo['predict_prob'] is not None:
        y_pred_proba = algo['predict_prob'](model, x_test)
        fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
        auc = metrics.roc_auc_score(y_test, y_pred_proba)
    else:
        fpr = tpr = auc = None
    # Confusion matrix (also requires discrete labels, see note above).
    cm = metrics.confusion_matrix(y_test, y_pred)
    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm,
            'mse': mse, 'rmse': rmse, 'mae': mae}
# Train, persist, and score every configured model, printing a summary line.
for model in models:
    x_train, x_test, y_train, y_test = datasets[model['ds']]
    # Generate model filename based on algorithm name.
    model_name = model['algo']
    model_filename = f"{model_name}_model.pickle"
    model['metrics'] = evaluate_model(
        algorithms[model['algo']], x_train, y_train, x_test, y_test,
        model_filename)
    model_acc = model['metrics']['acc']
    model_prec = model['metrics']['prec']
    model_rec = model['metrics']['rec']
    model_f1 = model['metrics']['f1']
    model_kappa = model['metrics']['kappa']
    model_auc = model['metrics']['auc']
    # Fixed: the original f-string was missing its closing quote, which is
    # a SyntaxError (unterminated string literal).
    print(f'Algorithm: {model_name} | accuracy: {model_acc} | '
          f'precision: {model_prec} | rec: {model_rec} | f1: {model_f1} | '
          f'kappa: {model_kappa} | auc: {model_auc}')
我检查过,所有特征都是二元的:
# Sanity check: bucket every column by how many distinct values it holds
# (2 = binary, 1 = constant, >2 = non-binary).
binary_columns = []
solo = []
non_binary_columns = []
for column in undersampled_df.columns:
    distinct = undersampled_df[column].nunique()
    if distinct == 2:
        binary_columns.append(column)
    elif distinct == 1:
        solo.append(column)
    elif distinct > 2:
        non_binary_columns.append(column)
print("Binary Columns:")
print(binary_columns)
print("\n Non Binary Columns:")
print(non_binary_columns)
print("\n Solo Columns:")
print(solo)
Binary Columns:
['hospital_expire_flag', 'gender_F', 'gender_M', 'age_-65', 'age_+65', 'PAPs', 'PAPd', 'PAPm', 'SvO2 SQI', 'HR_65-89', 'HR_90-99', 'HR_+100', 'ABPs_0-119', 'ABPs_+120', 'ABPd_0-79', 'PO2 (Arterial)_0-74', 'PO2 (Arterial)_75-99', 'PO2 (Arterial)_+100', 'SaO2_0-94', 'SaO2_95-100', 'PCO2 (Arterial)_0-34', 'PCO2 (Arterial)_35-45', 'PCO2 (Arterial)_+45', 'PH (Venous)_0-7.30', 'PH (Venous)_7.31-7.41', 'PH (Venous)_7.41+', 'HCO3 (serum)_0-22', 'HCO3 (serum)_23-29', 'Temperature F_-97', 'Temperature F_97-99', 'Temperature F_+99', 'Creatinine (serum)_-0.74', 'Creatinine (serum)_0.74-1.35', 'Creatinine (serum)_+1.35', 'Total Bilirubin_0.1-1.2', 'Total Bilirubin_+1.2', 'Heart Rhythm_AF (Atrial Fibrillation)', 'Heart Rhythm_SR (Sinus Rhythm)', 'Heart Rhythm_ST (Sinus Tachycardia) ', 'Skin Temp_Cool', 'Skin Temp_Warm']
Non Binary Columns:
[]
Solo Columns:
[]
有人能告诉我该如何解决这个问题吗?
我不知道该怎么做才能解决这个问题。
1 个回答
问题在于你使用的是回归模型,而不是分类模型。像LinearRegression()、RandomForestRegressor()和SVR()这些函数的predict()会返回浮点数,而不是二进制值(0或1)。这就会导致你看到的错误信息(“ValueError: Classification metrics can't handle a mix of binary and continuous targets”),因为你在用准确率的指标来评估这些浮点数。
对于你提到的每一个回归模型,都有一个对应的分类模型。
下面这个简单的例子可以正常工作:
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
X, y = load_breast_cancer(return_X_y=True)


def fit_and_accuracy(model, X, y):
    """Fit *model* on (X, y) and print its training-set accuracy."""
    fitted = model.fit(X, y)
    predictions = fitted.predict(X)
    print(accuracy_score(y, predictions))


# Classifier counterparts of the regressors, evaluated in the same order.
for classifier in (
    LogisticRegression(solver='newton-cg'),
    DecisionTreeClassifier(),
    SVC(),
    RandomForestClassifier(),
    MLPClassifier(),
):
    fit_and_accuracy(classifier, X, y)
所以在你的代码中,你应该使用LogisticRegression、DecisionTreeClassifier、SVC、RandomForestClassifier或者MLPClassifier等分类模型,而不是回归模型。