我试图编写一个函数:读取 txt 文件并按单词进行分词处理,步骤包括分词、去除空格与停用词、词干提取(stemming)以及统计词频。但是词干提取的结果有问题:一些单词结尾的 "s" 和 "r" 被程序"吞掉"了(例如 "ponies" 变成 "poni")。另外,词频统计的代码应该插入到哪个部分比较合适?
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize #split variable into words
from nltk.corpus import stopwords #stopwords
from nltk.stem import PorterStemmer #stem tools
from collections import defaultdict
#1)
def tokenizers(filename, with_counts=False):
    """Read a UTF-8 text file, tokenize it into words, drop English stop
    words and punctuation symbols, and stem the remaining words.

    Args:
        filename: path of the text file to process.
        with_counts: when True, additionally return a word -> frequency
            dict computed over the stemmed, filtered words. Defaults to
            False so existing callers keep getting just the list.

    Returns:
        list of stemmed words, or (words, counts) when with_counts=True.

    NOTE: PorterStemmer strips suffixes by design (e.g. "ponies" ->
    "poni", "caresses" -> "caress"), so trailing letters like "s" being
    "swallowed" is normal stemmer behavior, not a bug in this function.
    """
    # Read the WHOLE file. The original used readline(), which only
    # returned the first line. `with` guarantees the file is closed.
    with open(filename, "r", encoding="utf-8") as fh:
        text = fh.read()

    # Stop words plus punctuation symbols. The original called
    # set.union(',', '(",")', ...), which iterates each string argument
    # character-by-character and so added stray '"' characters — list
    # every symbol explicitly instead.
    stop_words = set(stopwords.words("english"))
    stop_words.update(
        {",", "(", ")", "[", "]", "{", "}", "#", "@", "!", ":", ";", ".", "?"}
    )

    # Tokenize the text, drop stop words/symbols, then stem. Filtering
    # happens on the raw token because the NLTK stop-word list is
    # unstemmed.
    ps = PorterStemmer()
    filter_words = [
        ps.stem(w) for w in word_tokenize(text) if w not in stop_words
    ]

    if not with_counts:
        return filter_words

    # Word counting belongs HERE — after stop-word removal and stemming —
    # so the frequencies describe the cleaned vocabulary.
    counts = defaultdict(int)
    for w in filter_words:
        counts[w] += 1
    return filter_words, dict(counts)
目前没有回答
相关问题 更多 >
编程相关推荐