具有奇异行为的随机森林分类器的随机超参数搜索

# Candidate values for every random-forest hyperparameter in the search.

# Integer-valued grids: ten evenly spaced points, truncated to np.int64.
n_estimators = list(np.linspace(100, 1000, 10).astype(np.int64))
max_depth = list(np.linspace(1, 50, 10).astype(np.int64))

# Fractional grids are kept as float arrays.
min_samples_split = np.linspace(0.1, 1, 10)
min_samples_leaf = np.linspace(0.1, 0.5, 10, endpoint=False)

# The upper bound of 30 is hard-coded; an earlier variant of this line used
# X.shape[1] in its place.
max_features = list(np.linspace(1, 30, 10, endpoint=False).astype(np.int64))

# Gather the grids under the estimator's parameter names.
hp_dict = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
)

# Show the range of values used for each hyperparameter.
for name in hp_dict:
    print('{:<20} : {}'.format(name, hp_dict[name]))

# Output:
# n_estimators : [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# max_depth : [1, 6, 11, 17, 22, 28, 33, 39, 44, 50]
# min_samples_split : [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
# min_samples_leaf : [0.1 0.14 0.18 0.22 0.26 0.3 0.34 0.38 0.42 0.46]
# max_features : [1, 3, 6, 9, 12, 15, 18, 21, 24, 27]

class CustomRandomSearch:
    """Random hyperparameter search scored with k-fold cross-validation.

    The search runs inside ``__init__``: it builds the cartesian product of
    all value lists in ``hp_dict``, samples ``n_settings`` combinations
    without replacement, and evaluates each one with ``cross_validate``.

    Attributes set by the constructor:
        data_, labels_: the ``X`` and ``y`` arguments, stored as given.
        settings_keys_: hyperparameter names, in ``hp_dict`` order.
        settings_cartesian: ``pd.Series`` holding every combination as a tuple.
        settings_random_: the sampled subset of ``settings_cartesian``.
        clf_name_: class name of ``clf``.
        metrics_df_: one row per sampled setting with the fold-averaged
            ``Accuracy``, ``Precision``, ``Recall``, ``fit_time`` and
            ``score_time``.
        best_metric_: highest fold-averaged value of ``best_metric`` seen.
            It starts at 0, so only strictly positive scores can win.
        best_settings_: parameter dict that produced ``best_metric_``
            (``None`` if nothing beat the initial 0). ``best_setting_`` is
            kept as an alias for code that read the old attribute name.
        best_classifier_: an unfitted copy of ``clf`` configured with
            ``best_settings_`` (``None`` if nothing beat the initial 0).
    """

    # Scorer labels double as the score column names of ``metrics_df_``.
    _SCORING = {
        'Accuracy': 'accuracy',
        'Precision': 'precision_macro',
        'Recall': 'recall_macro',
    }
    _TIMING_COLUMNS = ('fit_time', 'score_time')
    # cross_validate prefixes scorer labels (e.g. ``test_Accuracy``); this
    # pattern recovers the bare label. Compiled once instead of per result.
    _SCORE_KEY = re.compile(r'.*(Accuracy|Precision|Recall)')

    def __init__(self, X, y, clf, hp_dict, n_settings=100, cv=3,
                 best_metric='accuracy', random_seed=None):
        """Run the random search and store its results on the instance.

        Args:
            X, y: data and labels, passed unchanged to ``cross_validate``.
            clf: estimator prototype. It is copied for every setting, so the
                caller's object is not re-parameterised by the search.
            hp_dict: mapping of parameter name -> iterable of candidate values.
            n_settings: number of combinations to sample and evaluate.
            cv: forwarded to ``cross_validate``.
            best_metric: lower-case name of the score used to pick the best
                setting ('accuracy', 'precision' or 'recall'); any other
                value means no best setting is recorded.
            random_seed: seed for NumPy's global random generator.
        """
        # Imported locally so the block does not rely on a file-level import.
        from sklearn.base import clone

        # Seeds NumPy's *global* generator, which the sampling below uses.
        np.random.seed(seed=random_seed)

        self.data_ = X
        self.labels_ = y
        self.settings_keys_ = list(hp_dict)
        self.settings_cartesian = pd.Series(list(itertools.product(*hp_dict.values())))
        self.settings_random_ = self.random_settings(self.settings_cartesian, n_settings, random_seed)
        self.clf_name_ = type(clf).__name__
        self.best_metric_ = 0
        self.best_settings_ = None
        self.best_setting_ = None  # backward-compatible alias of best_settings_
        self.best_classifier_ = None

        # Sized from the sampled settings, which may be fewer than n_settings
        # when the grid is small.
        metrics_df = pd.DataFrame(
            columns=[*self._SCORING, *self._TIMING_COLUMNS],
            index=np.arange(len(self.settings_random_)),
        )
        self.metrics_df_ = metrics_df

        print('Random Search| Model: {}| {}-Fold Cross Validation\n'.format(self.clf_name_, cv))
        for n, setting in enumerate(self.settings_random_):
            setting_dict = dict(zip(self.settings_keys_, setting))

            # A fresh copy per setting: storing the shared ``clf`` as the best
            # classifier would leave it holding the *last* settings tried.
            candidate = clone(clf).set_params(**setting_dict)
            cv_results = cross_validate(candidate, X, y, scoring=self._SCORING,
                                        cv=cv, return_train_score=False)
            print('Settings: {}'.format(setting_dict))

            for result_name, result in cv_results.items():
                # Average the per-fold values.
                result_mean = result.mean()
                matched = self._SCORE_KEY.findall(result_name)

                if not matched:
                    # Not one of our scorers (e.g. the timing entries).
                    print('\t{}: {}'.format(result_name, result_mean.round(2)))
                    metrics_df.loc[n, result_name] = result_mean
                    continue

                metric_name = matched[0]
                # .loc writes into the frame itself; chained ``df[col][n] = v``
                # may write to a temporary copy.
                metrics_df.loc[n, metric_name] = result_mean
                print('\t{}: {}'.format(metric_name, result_mean))

                if metric_name.lower() == best_metric and result_mean > self.best_metric_:
                    self.best_metric_ = result_mean
                    self.best_settings_ = setting_dict
                    self.best_setting_ = setting_dict
                    self.best_classifier_ = candidate
            print('='*50)

    def random_settings(self, settings_cartesian, n_settings, random_seed):
        """Sample ``n_settings`` distinct entries from ``settings_cartesian``.

        Sampling uses NumPy's global random generator (seeded in ``__init__``);
        ``random_seed`` is accepted for interface compatibility and is not
        used here. If more settings are requested than exist, a warning is
        issued and every available setting is returned in random order.

        Returns:
            ``settings_cartesian`` indexed by the sampled positions.
        """
        import warnings

        n_available = len(settings_cartesian)
        if n_settings > n_available:
            warnings.warn(
                'Requested {} settings but only {} combinations exist; '
                'using all of them.'.format(n_settings, n_available)
            )
            n_settings = n_available

        # Draw positions without replacement so no setting is evaluated twice.
        sampled_index = np.random.choice(np.arange(n_available), n_settings, replace=False)
        return settings_cartesian[sampled_index]

# First run: search only the tree-count, depth and feature-count grids.
# The min_samples_split / min_samples_leaf grids are deliberately left out.
hp_dict = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
)

# Evaluate 5 randomly sampled settings with a fixed seed.
rsearch = CustomRandomSearch(
    X, y, RandomForestClassifier(), hp_dict, n_settings=5, random_seed=1
)
rsearch.metrics_df_

# Output:
# Random Search| Model: RandomForestClassifier| 3-Fold Cross Validation
#
# {'n_estimators': 600, 'max_depth': 1, 'max_features': 21}
# Settings: {'n_estimators': 600, 'max_depth': 1, 'max_features': 21}
#     fit_time: 0.87
#     score_time: 0.83
#     Accuracy: 0.5236672225712568
#     Precision: 0.006341609686704505
#     Recall: 0.01196739485509539
# ==================================================
# {'n_estimators': 900, 'max_depth': 6, 'max_features': 24}
# Settings: {'n_estimators': 900, 'max_depth': 6, 'max_features': 24}
#     fit_time: 3.94
#     score_time: 1.35
#     Accuracy: 0.5409239034777237
#     Precision: 0.024122440405242623
#     Recall: 0.018964838417488486
# ==================================================
# {'n_estimators': 500, 'max_depth': 28, 'max_features': 6}
# Settings: {'n_estimators': 500, 'max_depth': 28, 'max_features': 6}
#     fit_time: 2.82
#     score_time: 0.99
#     Accuracy: 0.5643549612162997
#     Precision: 0.09548924947833685
#     Recall: 0.08020801637285205
# ==================================================
# {'n_estimators': 400, 'max_depth': 33, 'max_features': 24}
# Settings: {'n_estimators': 400, 'max_depth': 33, 'max_features': 24}
#     fit_time: 4.36
#     score_time: 0.76
#     Accuracy: 0.5645611295019107
#     Precision: 0.10931388963435491
#     Recall: 0.09537162949195259
# ==================================================
# {'n_estimators': 300, 'max_depth': 22, 'max_features': 6}
# Settings: {'n_estimators': 300, 'max_depth': 22, 'max_features': 6}
#     fit_time: 1.59
#     score_time: 0.61
#     Accuracy: 0.5598910737210692
#     Precision: 0.10028259405181572
#     Recall: 0.0844580862573192
# ==================================================
#    Accuracy  Precision     Recall  fit_time  score_time
# 0  0.523667  0.00634161  0.0119674  0.836494  0.810416
# 1  0.540924  0.0241224   0.0189648  3.8934    1.38452
# 2  0.564355  0.0954892   0.080208   2.61455   0.97889
# 3  0.564561  0.109314    0.0953716  4.24907   0.762426
# 4  0.559891  0.100283    0.0844581  1.54386   0.589665

# Second run: search all five grids, including the fractional
# min_samples_split / min_samples_leaf values.
hp_dict = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
)

# Evaluate 5 randomly sampled settings with a fixed seed.
rsearch = CustomRandomSearch(
    X, y, RandomForestClassifier(), hp_dict, n_settings=5, random_seed=1
)
rsearch.metrics_df_

# NOTE(review): every setting below yields exactly the same scores.
# Presumably the fractional min_samples_leaf (>= 0.1) and min_samples_split
# (up to 1.0) grids are read as fractions of the sample count and stop the
# trees from splitting meaningfully - confirm against the estimator docs.
#
# Output:
# Random Search| Model: RandomForestClassifier| 3-Fold Cross Validation
#
# {'n_estimators': 500, 'max_depth': 17, 'min_samples_split': 0.7000000000000001, 'min_samples_leaf': 0.33999999999999997, 'max_features': 1}
# Settings: {'n_estimators': 500, 'max_depth': 17, 'min_samples_split': 0.7000000000000001, 'min_samples_leaf': 0.33999999999999997, 'max_features': 1}
#     fit_time: 0.51
#     score_time: 0.71
#     Accuracy: 0.5236672225712568
#     Precision: 0.006341609686704505
#     Recall: 0.01196739485509539
# ==================================================
# {'n_estimators': 900, 'max_depth': 39, 'min_samples_split': 0.30000000000000004, 'min_samples_leaf': 0.38, 'max_features': 24}
# Settings: {'n_estimators': 900, 'max_depth': 39, 'min_samples_split': 0.30000000000000004, 'min_samples_leaf': 0.38, 'max_features': 24}
#     fit_time: 0.86
#     score_time: 1.25
#     Accuracy: 0.5236672225712568
#     Precision: 0.006341609686704505
#     Recall: 0.01196739485509539
# ==================================================
# {'n_estimators': 200, 'max_depth': 22, 'min_samples_split': 0.4, 'min_samples_leaf': 0.14, 'max_features': 21}
# Settings: {'n_estimators': 200, 'max_depth': 22, 'min_samples_split': 0.4, 'min_samples_leaf': 0.14, 'max_features': 21}
#     fit_time: 0.3
#     score_time: 0.28
#     Accuracy: 0.5236672225712568
#     Precision: 0.006341609686704505
#     Recall: 0.01196739485509539
# ==================================================
# {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 1.0, 'min_samples_leaf': 0.22, 'max_features': 6}
# Settings: {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 1.0, 'min_samples_leaf': 0.22, 'max_features': 6}
#     fit_time: 0.83
#     score_time: 1.21
#     Accuracy: 0.5236672225712568
#     Precision: 0.006341609686704505
#     Recall: 0.01196739485509539
# ==================================================
# {'n_estimators': 1000, 'max_depth': 28, 'min_samples_split': 0.4, 'min_samples_leaf': 0.18, 'max_features': 3}
# Settings: {'n_estimators': 1000, 'max_depth': 28, 'min_samples_split': 0.4, 'min_samples_leaf': 0.18, 'max_features': 3}
#     fit_time: 1.05
#     score_time: 1.35
#     Accuracy: 0.5236672225712568
#     Precision: 0.006341609686704505
#     Recall: 0.01196739485509539
# ==================================================
#    Accuracy  Precision     Recall  fit_time  score_time
# 0  0.523667  0.00634161  0.0119674  0.506414  0.714159
# 1  0.523667  0.00634161  0.0119674  0.860566  1.25096
# 2  0.523667  0.00634161  0.0119674  0.302913  0.28406
# 3  0.523667  0.00634161  0.0119674  0.834416  1.20777
# 4  0.523667  0.00634161  0.0119674  1.04618   1.35358

0条回答

目前没有回答

相关问题更多 >

编程相关推荐

热门问题

热门文章