Scikit-Learn 的 ValueError
数据文件 car_data.csv 的链接
我的代码:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Load the car dataset from 'car_data.csv'.
car_data = pd.read_csv('car_data.csv')
# Create X: every column except the target column 'Buy Rate'
X = car_data.drop('Buy Rate', axis=1)
# Create Y: the target column 'Buy Rate'
y = car_data['Buy Rate']
clf = RandomForestClassifier()
# Returns the estimator's hyperparameters; the return value is not used here.
clf.get_params()
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# This is the call that raises the ValueError shown in the traceback below
# ("could not convert string to float: 'Hyundai'").
clf.fit(X_train, y_train)
执行到 clf.fit(X_train, y_train) 这一行时,出现了下面的错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_51905/2395142735.py in ?()
----> 1 clf.fit(X_train, y_train)
~/Desktop/ml-course/env/lib/python3.10/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
1147 skip_parameter_validation=(
1148 prefer_skip_nested_validation or global_skip_validation
1149 )
1150 ):
-> 1151 return fit_method(estimator, *args, **kwargs)
~/Desktop/ml-course/env/lib/python3.10/site-packages/sklearn/ensemble/_forest.py in ?(self, X, y, sample_weight)
344 """
345 # Validate or convert input data
346 if issparse(y):
347 raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 348 X, y = self._validate_data(
349 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
350 )
351 if sample_weight is not None:
~/Desktop/ml-course/env/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
617 if "estimator" not in check_y_params:
618 check_y_params = {**default_check_params, **check_y_params}
619 y = check_array(y, input_name="y", **check_y_params)
620 else:
--> 621 X, y = check_X_y(X, y, **check_params)
622 out = X, y
623
624 if not no_val_X and check_params.get("ensure_2d", True):
~/Desktop/ml-course/env/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1143 raise ValueError(
1144 f"{estimator_name} requires y to be passed, but the target y is None"
1145 )
1146
-> 1147 X = check_array(
1148 X,
1149 accept_sparse=accept_sparse,
1150 accept_large_sparse=accept_large_sparse,
~/Desktop/ml-course/env/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
914 )
915 array = xp.astype(array, dtype, copy=False)
916 else:
917 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 918 except ComplexWarning as complex_warning:
919 raise ValueError(
920 "Complex data not supported\n{}\n".format(array)
921 ) from complex_warning
~/Desktop/ml-course/env/lib/python3.10/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
376 # Use NumPy API to support order
377 if copy is True:
378 array = numpy.array(array, order=order, dtype=dtype)
379 else:
--> 380 array = numpy.asarray(array, order=order, dtype=dtype)
381
382 # At this point array is a NumPy ndarray. We convert it to an array
383 # container that is consistent with the input's namespace.
~/Desktop/ml-course/env/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, dtype)
2082 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
2083 values = self._values
-> 2084 arr = np.asarray(values, dtype=dtype)
2085 if (
2086 astype_is_view(values.dtype, arr.dtype)
2087 and using_copy_on_write()
ValueError: could not convert string to float: 'Hyundai'
我看过这里的类似问题,但没有一个能解决我的问题。
2 个回答
0
你传给 .fit 的 X 变量必须是一个矩阵,其中的元素要能转换成 np.float32 类型。
下面这段话摘自 sklearn.ensemble.RandomForestClassifier 的文档:
fit(X, y, sample_weight=None)
X: {类似数组,稀疏矩阵},形状为 (样本数量, 特征数量)
这是训练用的输入样本。内部会把它的类型转换成 dtype=np.float32。如果提供的是稀疏矩阵,它会被转换成稀疏的 csc_matrix。
你很可能在训练数据中传入了字符串。模型只能用数值数据进行训练,所以所有文本数据在训练之前都必须转换成数值。要做到这一点,你需要对字符串(文本数据)进行编码。实现的方法有很多种,各有优缺点。你可以在 scikit-learn 用户指南“数据预处理”一章的“编码分类特征”(Encoding categorical features)一节中,了解更多关于这个主题和不同选项的信息。
3
这个错误出现的原因是你在X中使用的特征(比如品牌和颜色)是分类的。如果你用标签编码器把它们转换成数字变量,
from sklearn.preprocessing import LabelEncoder

# Use one encoder per categorical column. Refitting a single LabelEncoder on
# 'Color' would overwrite the classes it learned for 'Make', so the 'Make'
# codes could no longer be mapped back to their names with inverse_transform.
# NOTE: scikit-learn documents LabelEncoder as intended for target labels;
# OrdinalEncoder / OneHotEncoder are the feature-oriented alternatives.
encoders = {}
for column in ('Make', 'Color'):
    encoders[column] = LabelEncoder()
    # Replace each string category with its integer code.
    car_data[column] = encoders[column].fit_transform(car_data[column])
那么当你执行 car_data.head(2) 时,结果会像这样:
Make Year Price Mileage Color Buy Rate
0 30 2018 20000 50000 1 0.80
1 13 2019 25000 40000 4 0.70
这样就能解决你的问题!
但是,由于你的目标变量(也就是购买率)是连续值,直接用随机森林分类器训练时仍然会出错(sklearn 会抛出 “Unknown label type” 的 ValueError)。因此,要进行分类,你需要先对目标变量做分箱处理;如果你想直接预测连续的购买率,则应改用 RandomForestRegressor。
# Split 'Buy Rate' into three equal-width bins spanning its observed range.
# (To use fixed cut points instead, pass them as `bins`, e.g.
# bins=[0, 0.5, 0.75, 1] together with include_lowest=True.)
num_bins = 3
bin_names = {0: 'Low', 1: 'Medium', 2: 'High'}
# labels=False makes pd.cut return the integer bin index for each row ...
car_data['Buy Rate'] = pd.cut(car_data['Buy Rate'], bins=num_bins, labels=False)
# ... which is then mapped to a readable class name.
car_data['Buy Rate'] = car_data['Buy Rate'].map(bin_names)
结果,
0 Medium
1 Medium
2 Low
.
.
.
32 High
在分箱之后,你就可以用随机森林分类器来训练你的数据了。
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

car_data = pd.read_csv('car_data.csv')

# Encode the categorical feature columns as integers. One encoder is kept per
# column: refitting a single LabelEncoder on 'Color' would overwrite the
# classes learned for 'Make', so those codes could not be decoded afterwards.
encoders = {}
for column in ('Make', 'Color'):
    encoders[column] = LabelEncoder()
    car_data[column] = encoders[column].fit_transform(car_data[column])

# Turn the continuous target into three classes using equal-width bins over
# its observed range. labels=False yields the integer bin index, which is
# then mapped to a readable class name.
# (To use fixed cut points instead, pass them as `bins`, e.g.
# bins=[0, 0.5, 0.75, 1] together with include_lowest=True.)
num_bins = 3
car_data['Buy Rate'] = pd.cut(car_data['Buy Rate'], bins=num_bins, labels=False)
car_data['Buy Rate'] = car_data['Buy Rate'].map({0: 'Low', 1: 'Medium', 2: 'High'})

# Create X: every column except the target
X = car_data.drop('Buy Rate', axis=1)
# Create Y
y = car_data['Buy Rate']  # target variable

clf = RandomForestClassifier()
# Optional: inspect the hyperparameters (the return value is not used here).
clf.get_params()

# Hold out 20% of the rows as the test set, then train on the rest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
以上就是修改后的完整代码。