如何在 Python sklearn 的 KNN 分类器中进行 N 折交叉验证？

# Train a k-nearest-neighbours classifier on the KDD 10% sample
# ("data/kdd_10pc") and score it on a 10% slice of the "data/corrected" file.
# Every label other than 'normal.' is collapsed into a single 'attack.' class.
from time import time

import pandas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# TRAINING
col_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label",
]

kdd_data_10percent = pandas.read_csv(
    "data/kdd_10pc", header=None, names=col_names)

# The numeric feature columns are every column except the three categorical
# ones and the target, in their original order.
non_numeric = ("protocol_type", "service", "flag", "label")
num_features = [name for name in col_names if name not in non_numeric]
features = kdd_data_10percent[num_features].astype(float)

# Classify all labels other than "normal." as "attack."
labels = kdd_data_10percent['label'].copy()
labels[labels != 'normal.'] = 'attack.'
print(labels.value_counts())

# TODO: Normalising of data
# TODO: Principal Component Analysis - Data reduction

clf = KNeighborsClassifier(
    n_neighbors=5, algorithm='ball_tree', leaf_size=500)
t0 = time()
clf.fit(features, labels)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

# TESTING
kdd_data_test = pandas.read_csv(
    "data/corrected", header=None, names=col_names)
# Use .loc so the relabelling is applied to the frame itself; chained
# indexing (frame['label'][mask] = ...) may write to a temporary copy.
kdd_data_test.loc[kdd_data_test['label'] != 'normal.', 'label'] = 'attack.'
kdd_data_test[num_features] = kdd_data_test[num_features].astype(float)

# Only the held-out 10% split (features_test / labels_test) is scored below;
# the "train" halves of this split are not used.
features_train, features_test, labels_train, labels_test = train_test_split(
    kdd_data_test[num_features], kdd_data_test['label'],
    test_size=0.1, random_state=42)

t0 = time()
pred = clf.predict(features_test)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))

# accuracy_score expects (y_true, y_pred); the value is classification
# accuracy, not R squared.
acc = accuracy_score(labels_test, pred)
print("Accuracy is {}.".format(round(acc, 4)))

1条回答

网友

1楼 · 发布于 2024-04-25 05:12:26

K-fold cross validation

import numpy as np
from sklearn.model_selection import KFold

# Split a four-item sample into two folds and print, for each round, the
# training indices followed by the test indices.
X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train_idx, test_idx in kf.split(X):
    print("{} {}".format(train_idx, test_idx))

[2 3] [0 1]  # these are indices into X, not the values themselves
[0 1] [2 3]

Leave One Out cross validation

from sklearn.model_selection import LeaveOneOut

# Hold out one sample per round and print the training indices followed by
# the single test index.
X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train_idx, test_idx in loo.split(X):
    print("{} {}".format(train_idx, test_idx))

[1 2 3] [0]  # these are indices into X, not the values themselves
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]

Leave P-out Cross Validation

from sklearn.model_selection import LeavePOut
# Hold out every possible pair of samples in turn; on each round print the
# index arrays and slice the features and targets with them.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
lpo = LeavePOut(p=2)

for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index arrays select the matching rows of X and entries of y.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 1] TEST: [2 3]

相关问题更多 >

编程相关推荐

热门问题

热门文章