# Remove the 'Movie Title' column from `data` in place.
data.drop(columns='Movie Title', inplace=True)

# Predictors are every remaining column except the target column;
# the target is the 'worldwide_gross_usd' column.
features = data.drop(columns='worldwide_gross_usd')
charges = data['worldwide_gross_usd']

# Split with 20% of the rows held out for testing, using a fixed seed (42).
X_train, X_test, y_train, y_test = train_test_split(
    features, charges, test_size=0.2, random_state=42
)

# Fit the linear model on the training portion, then predict the held-out rows.
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Report regr.score() on each portion, labelled as R-squared.
train_r2 = regr.score(X_train, y_train)
print('Trained R-squared score: ', train_r2)
test_r2 = regr.score(X_test, y_test)
print('Tested R-squared score: ', test_r2)
输出:
Trained R-squared score: 0.5404764241697003
Tested R-squared score: 0.5845801856343114
# Repeat the experiment with a different split seed (12); 20% of the rows
# are again held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    features, charges, test_size=0.2, random_state=12
)

# Refit the linear model on the new training portion and predict the held-out rows.
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Report regr.score() on each portion, labelled as R-squared.
train_r2 = regr.score(X_train, y_train)
print('Trained R-squared score: ', train_r2)
test_r2 = regr.score(X_test, y_test)
print('Tested R-squared score: ', test_r2)
输出:
Trained R-squared score: 0.5345435646372121
Tested R-squared score: 0.602138324770633
正如您所注意到的,当我更改random_state
值时,我的训练分数下降了1%,但我的测试分数增加了2%
您应该选择第一个还是第二个R平方分数?
R平方得分可以用来快速评估回归模型,但仅凭它并不足以可靠地判断模型的好坏。
推论:
如果所有数据点都具有同等相关性,那么测试集上的R平方分数越高越好
如果您不确定数据集的相关性,那么您应该检查其他参数/方法,以找到哪个R平方分数更好
其他参数/方法:
您应该为这两种情况绘制残差图。检查哪一种情况的残差均值更接近零、方差更接近1(对于大多数数据集),哪一种就更好。如果任何一种情况下的残差图呈现出某种模式,那么这种情况就不好,可以改进。如果任何一种情况下的残差图中存在离群点(异常值),则该情况也不好,可以改进。
相关问题 更多 >
编程相关推荐