Splitting data into training and test sets

Published 2024-06-02 05:03:20


I want to reproduce this tutorial with a different dataset to classify two groups: https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/. However hard I try, I cannot get it to work. I am new to programming, so I would appreciate any help or hints.

My dataset is small (240 files per group), with files named 01-0240.

I think the problem is around these lines of code:

    if is_trian and filename.startswith('cv9'):
        continue
    if not is_trian and not filename.startswith('cv9'):
        continue
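Since the files here are numbered rather than named `cv000`-`cv999` like the tutorial's movie reviews, the `startswith('cv9')` test has to be replaced. A minimal sketch of one possible replacement, assuming filenames are plain numbers with an extension (e.g. `001.txt` ... `240.txt`) and reserving roughly the last 10% for the test set (the cutoff 216 is an assumption, not from the tutorial):

```python
def is_test_file(filename):
    """Hypothetical split rule: numbered files above 216 go to the test set."""
    stem = filename.split('.')[0]  # drop the extension
    try:
        number = int(stem)
    except ValueError:
        return False  # not a numbered file; treat as training data
    return number > 216  # files 217-240 form the test set

# Inside the loop, this would replace the startswith('cv9') checks:
# if is_trian and is_test_file(filename):
#     continue
# if not is_trian and not is_test_file(filename):
#     continue
```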

and these:

            trainy = [0 for _ in range(900)] + [1 for _ in range(900)]
            save_dataset([trainX,trainy], 'train.pkl')

            testY = [0 for _ in range(100)] + [1 for _ in range(100)]
            save_dataset([testX,testY], 'test.pkl')

So far I have run into two errors:

Input arrays should have the same number of samples as target arrays. Found 483 input samples and 200 target samples.

Unable to open file (unable to open file: name = 'model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
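For the second error: in the tutorial, a training script calls `model.save('model.h5')`, and a separate evaluation script loads that file, so `errno = 2` usually means the training script never ran (or ran in a different working directory). A hedged sketch of a guard that makes this failure mode explicit (`load_model_checked` is a hypothetical helper, not part of the tutorial):

```python
import os

def load_model_checked(path='model.h5'):
    """Fail with a clear message when the saved model file is missing."""
    if not os.path.exists(path):
        raise FileNotFoundError(
            "%s not found; run the training script that calls "
            "model.save('%s') first" % (path, path))
    from keras.models import load_model  # deferred import: needs Keras installed
    return load_model(path)
```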

I would really appreciate any prompt help.

Thanks in advance.

// Part of the code //

# load all docs in a directory
def process_docs(directory, is_trian):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any transcript in the test set

I want to add an argument below to indicate whether training or test files are being processed, as mentioned in the tutorial. Or if there is a better way, please share it.

        if is_trian and filename.startswith('----'):
            continue
        if not is_trian and not filename.startswith('----'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc)
        # add to list
        documents.append(tokens)
    return documents

# save a dataset to file
def save_dataset(dataset, filename):
    dump(dataset, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load all training transcripts
healthy_docs = process_docs('PathToData/healthy', True)
sick_docs = process_docs('PathToData/sick', True)
trainX = healthy_docs + sick_docs
trainy = [0 for _ in range(len(healthy_docs))] + [1 for _ in range(len(sick_docs))]
save_dataset([trainX,trainy], 'train.pkl')

# load all test transcripts
healthy_docs = process_docs('PathToData/healthy', False)
sick_docs = process_docs('PathToData/sick', False)
testX = healthy_docs + sick_docs
testY = [0 for _ in range(len(healthy_docs))] + [1 for _ in range(len(sick_docs))]

save_dataset([testX,testY], 'test.pkl')

2 Answers

You should post more of your code, but it sounds like your problem is how you manage the data. Suppose you have 240 files in a folder named "healthy" and 240 files in a folder named "sick". You then need to label all the healthy people 0 and all the sick people 1. Try the following:

from glob import glob 
from sklearn.model_selection import train_test_split

#get the filenames for healthy people 
xhealthy = [ fname for fname in glob( 'pathToData/healthy/*' )]

#give healthy people label of 0
yhealthy = [ 0 for i in range( len( xhealthy ))]

#get the filenames of sick people
xsick    = [ fname for fname in glob( 'pathToData/sick/*')]

#give sick people label of 1
ysick    = [ 1 for i in range( len( xsick ))]

#combine the data 
xdata = xhealthy + xsick 
ydata = yhealthy + ysick 

#create the training and test set 
X_train, X_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.1)

Then train your model with X_train, y_train and test it with X_test, y_test. Keep in mind that your X data is just filenames, which still need to be processed. The more code you post, the more people can help you solve the problem.

I was able to solve the problem by manually separating the dataset into training and test sets and then labeling each set separately. My current dataset is too small, so once I am able to, I will keep looking for a better solution for large datasets. Closing the question.
