随机森林“完美”混淆矩阵

# Separate X_train and Y_train X = ratings_prepared[:, :-1] y= ratings_prepared[:,-1] ################################################################################## # Separate test and train (stratified, 20% test) import numpy as np from sklearn.model_selection import StratifiedKFold from sklearn.base import clone skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) for train_index, test_index in skfolds.split(X,y): X_train = X[train_index] y_train = y[train_index] X_test = X[test_index] y_test = y[test_index]

def plot_roc_curve(fpr, tpr, label=None): plt.plot(fpr, tpr, linewidth=2, label =label) plt.plot([0,1], [0,1],'k--') plt.axis([0,1,0,1]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') def plot_precision_recall_vs_threshold(precisions, recalls, thresholds): plt.plot(thresholds, precisions[:-1],"b--", label="Precision") plt.plot(thresholds, recalls[:-1], "g-", label="Recall") plt.xlabel("Threashold") plt.legend(loc="center left") plt.ylim([0,1]) ############################# Train Models ############################# from sklearn.linear_model import SGDClassifier sgd_clf =SGDClassifier(random_state=42) sgd_clf.fit(X_train,y_train) y_pred = sgd_clf.predict(X_train) # f1 score f1_score(y_train, y_pred) # confusion matrix tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel() (tn, fp, fn, tp) from sklearn.metrics import plot_confusion_matrix import matplotlib.pyplot as plt disp = plot_confusion_matrix(sgd_clf, X_train, y_train, cmap=plt.cm.Blues, normalize='true') # recall and precision from sklearn.metrics import precision_score, recall_score precision_score(y_train, y_pred) # Precision Recall from sklearn.metrics import precision_recall_curve plot_precision_recall_vs_threshold(precisions, recalls, thresholds) plt.show() # Plot ROC curve y_scores = cross_val_predict(sgd_clf, X_train, y_train, cv=3, method="decision_function") fpr, tpr, thresholds = roc_curve(y_train, y_scores) plot_roc_curve(fpr, tpr) plt.show() # recall and precision from sklearn.metrics import precision_score, recall_score precision_score(y_train, y_pred) ### Precision score: 0.5084427767354597

from sklearn.ensemble import RandomForestClassifier forest_clf = RandomForestClassifier(random_state=42) y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3, method='predict_proba') y_scores_forest = y_probas_forest[:,1] fpr_forest, tpr_forest, threshold_forest = roc_curve(y_train,y_scores_forest) plt.plot(fpr, tpr, "b:", label="SGD") plot_roc_curve(fpr_forest, tpr_forest, "Random Forest") plt.legend(loc="lower right") plt.show()

from sklearn.ensemble import RandomForestClassifier forest_clf = RandomForestClassifier(random_state=42) y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3, method='predict_proba') y_scores_forest = y_probas_forest[:,1] fpr_forest, tpr_forest, threshold_forest = roc_curve(y_train,y_scores_forest) forest_clf.fit(X_train,y_train) y_pred = forest_clf.predict(X_train) # f1 score f1_score(y_train, y_pred) # confusion matrix from sklearn.metrics import plot_confusion_matrix import matplotlib.pyplot as plt disp = plot_confusion_matrix(forest_clf, X_train, y_train, cmap=plt.cm.Blues, normalize='true')

1条回答

网友

1楼 · 发布于 2024-06-16 09:06:50

您获得满分的原因是您没有对测试数据进行度量

在第一段中，您将对训练和测试数据进行80/20分割，但是所有度量ROC、混淆矩阵等都是在原始训练数据上进行的，而不是在测试数据上

有了这样的设置，你的报告会显示你疯狂地过度装修

您应该做的是将经过培训的模型应用于测试数据，并查看该模型的工作方式

相关问题更多 >

编程相关推荐

热门问题

热门文章