从文件夹中读取 txt 文件时，某些文件返回 NaN，但这些文件实际上并不是空的。我无法找出这些文件被忽略的原因。
import pandas as pd
import os
def get_transcripts(file_path, encoding='utf-8', errors='replace'):
    """Return the full text of the transcript stored at *file_path*.

    The file is decoded with an explicit encoding rather than the platform
    default. With a locale-dependent default (for example cp1252 on many
    Windows installs) a transcript containing bytes outside that code page
    raises UnicodeDecodeError; a bare ``except`` would swallow it and the
    function would return None for a file that is not empty.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file.
        errors: Decoding error policy passed to ``open``. The default
            ``'replace'`` substitutes U+FFFD for undecodable bytes so the
            rest of the transcript is still returned.

    Returns:
        The file contents as a string, or None if the file could not be
        opened or read.
    """
    try:
        with open(file_path, 'r', encoding=encoding, errors=errors) as t:
            return t.read()
    except OSError as exc:
        # Best effort: report which file failed and why, then signal the
        # failure to the caller with an explicit None.
        print(f'{file_path}: {exc!r}')
        return None
# Parent directory that holds one sub-folder per podcast.
path = 'D:\\Urja\\Data_Analytics_project\\NPR_Podcasts\\NPR_Podcasts\\'

# Collect one dict per episode and build the DataFrame once at the end;
# calling pd.concat inside the loop re-copies every earlier row each time.
rows = []

# Only the first 20 directory entries are processed. os.listdir returns
# entries in arbitrary order, so which 20 are picked is not guaranteed.
for podcast in os.listdir(path)[0:20]:
    podcast_dir = os.path.join(path, podcast)
    # Episode ids restart at 1 for every podcast.
    for episode_id, episode in enumerate(os.listdir(podcast_dir), start=1):
        rows.append({
            'id': episode_id,
            'podcast_name': podcast,
            'episode_name': episode.replace('.txt', ''),
            # get_transcripts yields None when the file could not be read.
            'transcript': get_transcripts(os.path.join(podcast_dir, episode)),
        })

# DataFrame holding one row per episode transcript.
df = pd.DataFrame(
    rows,
    columns=['id', 'podcast_name', 'episode_name', 'transcript'],
)
df
目前没有回答
相关问题 更多 >
编程相关推荐