我想使用LightGBM和Sklearn来完成一个多分类任务,代码如下
#Input
X = copy.deepcopy(GTEx_feature)
y = copy.deepcopy(out)
#TrainingSet : TestSet = 4 : 1
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = SEED)
########################################
########## Model Construction ##########
print("\n...... Training Model ......\n")
# LightGBM params
param_dict = {
"objective":'multiclass',
"num_class":[5],
"learning_rate":[0.1, 0.05, 0.02, 0.015, 0.01],#
"num_leaves": range(10,36,5),# Maximum tree leaves for base learners.
"max_depth" : [-1,2,3,4,5,10,20,40,50],#Maximum tree depth for base learners, <=0 means no limit.
"min_data_in_leaf": range(1, 45, 2),#
"feature_fraction" : [i / 10 for i in range(2,11)],#
"metric" : "multi_error",#
"early_stopping_rounds" : [None],#
"n_jobs" : [-1],#
"silent" : [True],#
"verbose" : [-1],#
"n_estimators" : range(50,1000,50),#
"bagging_fraction" : [i / 10 for i in range(2, 11)],#
"bagging_freq" : [0, 1, 2],#
"lambda_l1" : [0, 0.001, 0.005, 0.01, 0.1],#
"lambda_l2" : [0, 0.001, 0.005, 0.01, 0.1],#
"random_state":[2020]#
}
#Initiate model
model = lgb.LGBMClassifier()
#Adjust hyper-parameters with 5-fold cross validation
rscv = RandomizedSearchCV(model,#
param_dict,#
n_iter=100,# Number of parameter settings that are sampled. n_iter tradesoff runtime vs quality of the solution.
cv = 5,# Determines the cross-validation splitting strategy.
verbose = 0,# Controls the verbosity: the higher, the more messages.
scoring = "roc_auc",#
n_jobs =-1#
)
gbm=rscv.fit(X_train, y_train)
我在最后一行中出错:
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-110-97f7932a1488> in <module>
42 n_jobs =-1#
43 )
---> 44 gbm=rscv.fit(X_train, y_train)
45 ########## Model Evaluation ##########
46 print("\n...... Evaluating Model ......\n")
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1527 def _run_search(self, evaluate_candidates):
1528 """Search n_iter candidates from param_distributions"""
-> 1529 evaluate_candidates(ParameterSampler(
1530 self.param_distributions, self.n_iter,
1531 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
698
699 def evaluate_candidates(candidate_params):
--> 700 candidate_params = list(candidate_params)
701 n_candidates = len(candidate_params)
702
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in __iter__(self)
282 % (grid_size, self.n_iter, grid_size), UserWarning)
283 n_iter = grid_size
--> 284 for i in sample_without_replacement(grid_size, n_iter,
285 random_state=rng):
286 yield param_grid[i]
sklearn\utils\_random.pyx in sklearn.utils._random.sample_without_replacement()
OverflowError: Python int too large to convert to C long
因此,我想知道为什么会发生这种错误,我可以尝试如何处理它
这是一行X
g 1.057671
c 2.644094
fi -0.302407
ph -0.772771
da -0.449314
phy -0.447774
gen -1.042650
gs 0.053665
ts -0.197370
sig -0.137325
endna -0.255032
eireig -0.372373
ee3 -0.159200
cong 0.000000
fhft 0.000000
sple 0.000000
p 0.000000
ts 0.000000
kb 0.000000
non 1.000000
sn 1.000000
nss -0.014290
Name: 1, dtype: float64
我在使用相同的X来完成二进制分类时没有遇到这样的错误, y如下所示
0 2
1 0
2 0
3 4
4 4
..
4995 0
4996 1
4997 1
4998 0
4999 0
Name: mtle, Length: 5000, dtype: int64
目前没有回答
相关问题 更多 >
编程相关推荐