I'm trying to build an ensemble of support vector machines with scikit-learn, in particular to optimize the hyperparameters. Randomly, I get the following error:
File "C:\Users\jakub\anaconda3\envs\SVM_ensembles\lib\site-packages\sklearn\svm\_base.py", line 250, in _dense_fit
self.probB_, self.fit_status_ = libsvm.fit(
File "sklearn\svm\_libsvm.pyx", line 191, in sklearn.svm._libsvm.fit
ValueError: Invalid input - all samples with positive weights have the same label.
As far as I understand, this comes from the file https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/svm/src/libsvm/svm.cpp and is related to examples of only one class getting into an SVM. I'm using stratified K-fold cross-validation and the dataset is fairly balanced (45% one class, 55% the other), so this shouldn't be able to happen.

What can I do?

The optimization code that raises the error:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical


def get_best_ensemble_params(X_train, y_train, X_test, y_test, n_tries=5):
    search_spaces = {
        "max_samples": Real(0.1, 1, "uniform"),
        "max_features": Real(0.1, 1, "uniform"),
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-6, 1e+6, "log-uniform"),
        "gamma": Real(1e-6, 1e+1, "log-uniform")
    }
    best_accuracy = 0
    best_model = None
    for i in range(n_tries):
        done = False
        while not done:
            try:
                optimizer = BayesSearchCV(SVMEnsemble(), search_spaces, cv=3, n_iter=10,
                                          n_jobs=-1, n_points=10, verbose=1)
                optimizer.fit(X_train, y_train)  # <- ERROR HERE
                accuracy = accuracy_score(y_test, optimizer.predict(X_test))
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_model = optimizer
                done = True
                print(i, "job done")
            except ValueError:
                pass  # retry on the sporadic error
    return best_model.best_params_


if __name__ == "__main__":
    dataset_name = "acute_inflammations"
    loading_functions = {
        "acute_inflammations": load_acute_inflammations,
        "breast_cancer_coimbra": load_breast_cancer_coimbra,
        "breast_cancer_wisconsin": load_breast_cancer_wisconsin
    }
    X, y = loading_functions[dataset_name]()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)  # transform only; the scaler is already fitted on the training set

    params = get_best_ensemble_params(X_train, y_train, X_test, y_test)
    params["n_jobs"] = -1
    params["random_state"] = 0

    model = SVMEnsemble(n_estimators=20, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
My custom SVMEnsemble is a BaggingClassifier with a hard-coded SVC:
import inspect

import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from skopt import BayesSearchCV

svm_possible_args = {"C", "kernel", "degree", "gamma", "coef0", "shrinking", "probability", "tol", "cache_size",
                     "class_weight", "max_iter", "decision_function_shape", "break_ties"}

bagging_possible_args = {"n_estimators", "max_samples", "max_features", "bootstrap", "bootstrap_features",
                         "oob_score", "warm_start", "n_jobs"}

common_possible_args = {"random_state", "verbose"}


class SVMEnsemble(BaggingClassifier):
    def __init__(self, voting_method="hard", n_jobs=-1,
                 n_estimators=10, max_samples=1.0, max_features=1.0,
                 C=1.0, kernel="linear", gamma="scale",
                 **kwargs):
        if voting_method not in {"hard", "soft"}:
            raise ValueError(f"voting_method {voting_method} is not recognized.")

        self._voting_method = voting_method
        self._C = C
        self._gamma = gamma
        self._kernel = kernel

        passed_args = {
            "n_jobs": n_jobs,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "C": C,
            "gamma": gamma,
            "cache_size": 1024,
        }
        kwargs.update(passed_args)

        svm_args = {
            "probability": True if voting_method == "soft" else False,
            "kernel": kernel
        }
        bagging_args = dict()

        # route each argument to the SVC, the BaggingClassifier, or both
        for arg_name, arg_val in kwargs.items():
            if arg_name in svm_possible_args:
                svm_args[arg_name] = arg_val
            elif arg_name in bagging_possible_args:
                bagging_args[arg_name] = arg_val
            elif arg_name in common_possible_args:
                svm_args[arg_name] = arg_val
                bagging_args[arg_name] = arg_val
            else:
                raise ValueError(f"argument {arg_name} is not recognized.")

        self.svm_args = svm_args
        self.bagging_args = bagging_args

        base_estimator = SVC(**svm_args)
        super().__init__(base_estimator=base_estimator, **bagging_args)

    @property
    def voting_method(self):
        return self._voting_method

    @voting_method.setter
    def voting_method(self, new_voting_method):
        if new_voting_method == "soft":
            self._voting_method = new_voting_method
            self.svm_args["probability"] = True
            base_estimator = SVC(**self.svm_args)
            super().__init__(base_estimator=base_estimator, **self.bagging_args)
        elif self._voting_method == "soft":
            self._voting_method = new_voting_method
            self.svm_args["probability"] = False
            base_estimator = SVC(**self.svm_args)
            super().__init__(base_estimator=base_estimator, **self.bagging_args)
        else:
            self._voting_method = new_voting_method

    @property
    def C(self):
        return self._C

    @C.setter
    def C(self, new_C):
        self._C = new_C
        self.svm_args["C"] = new_C
        base_estimator = SVC(**self.svm_args)
        super().__init__(base_estimator=base_estimator, **self.bagging_args)

    @property
    def gamma(self):
        return self._gamma

    @gamma.setter
    def gamma(self, new_gamma):
        self._gamma = new_gamma
        self.svm_args["gamma"] = new_gamma
        base_estimator = SVC(**self.svm_args)
        super().__init__(base_estimator=base_estimator, **self.bagging_args)

    @property
    def kernel(self):
        return self._kernel

    @kernel.setter
    def kernel(self, new_kernel):
        self._kernel = new_kernel
        self.svm_args["kernel"] = new_kernel
        base_estimator = SVC(**self.svm_args)
        super().__init__(base_estimator=base_estimator, **self.bagging_args)

    def predict(self, X):
        if self._voting_method == "hard":
            return super().predict(X)
        elif self._voting_method == "soft":
            # sum the per-estimator class probabilities and take the argmax
            probabilities = np.zeros((X.shape[0], self.classes_.shape[0]))
            for estimator in self.estimators_:
                estimator_probabilities = estimator.predict_proba(X)
                probabilities += estimator_probabilities
            return self.classes_[probabilities.argmax(axis=1)]
        else:
            raise ValueError(f"voting_method {self._voting_method} is not recognized.")
From the way you describe the problem (you get it "randomly"), and from the description of your data and code, I'm almost certain the problem is that the bagging classifier occasionally picks a random subsample of training examples that contains only one class. Stratified K-fold splitting won't help you here, because it only controls the original split of the data into train/test, not how BaggingClassifier draws its random subsamples of max_samples from the training set. If you look at the code of how BaggingClassifier picks a subsample, you'll see there is no protection against this kind of problem. One very simple way to find out for sure is to replace "max_samples": Real(0.1, 1, "uniform") with some smaller numbers, e.g. "max_samples": Real(0.02, 0.03, "uniform") (or set it to some fixed small value), and check whether you start getting the error more often.
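As a minimal sketch of that check (assuming the search_spaces dict from your question is in scope), something along these lines should make the error fire far more often if the subsampling is indeed the culprit:

from skopt.space import Real

# Shrink the subsample fraction so that tiny (and therefore much more
# often single-class) subsamples become very likely.
search_spaces["max_samples"] = Real(0.02, 0.03, "uniform")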
I'm not sure whether you really run it with n_tries=5 and n_iter=10 (which seems a bit small for all the hyperparameters you have), or with larger values, and/or maybe run the whole script many times with different random seeds, but in any case let's compute the probability of hitting this problem with max_samples=0.1 and a dataset of 120 examples split 55%/45%. Say you end up with 96 training examples split 45/55, i.e. 53+43 examples. With bootstrapping enabled, every time a bagging classifier is trained it randomly picks, say, 10 of the 96 samples (with replacement, since bootstrap is on by default). The chance of picking all of them from the larger class is (53/96)^10, which is roughly 0.26%. That means that if you train 50 such classifiers in a row, the chance that one of them hits this problem is already about 12.5%. If you keep running searches like this, you will almost inevitably run into it (for simplicity I fixed max_samples=0.1 here, which is not exactly right, but you most likely come close to that value quite often).
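The same back-of-the-envelope numbers, reproduced in a few lines of Python:

# Probability that one bootstrap subsample of 10 (drawn with replacement
# from 96 training examples, 53 of them in the larger class) is single-class:
p_single = (53 / 96) ** 10          # ~0.0026, i.e. ~0.26%

# Probability that at least one of 50 such classifiers hits this case:
p_any = 1 - (1 - p_single) ** 50    # ~0.12, i.e. ~12.5%

print(f"per classifier: {p_single:.4%}, over 50 classifiers: {p_any:.2%}")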
The last question is how to deal with this. There are a few possible answers:

- Don't allow very small subsamples: increase the minimum value of max_samples, or make it depend on the number of examples.
- There are other possibilities as well - for example, after splitting the data into train/test you can artificially inflate the training data by replacing each sample with N identical samples (where N is e.g. 2 or 10), which reduces the chance of the bagging classifier randomly picking a subsample with only one class (see the sketch after this list).