Python sklearn ValueError: empty vocabulary

Posted 2024-04-26 04:31:27


I am trying to reproduce a previously completed project, and I have run into trouble with the CountVectorizer function. Below is the code relevant to the problem.

from __future__ import division
import nltk, textmining, pprint, re, os.path 
#import numpy as np
from nltk.corpus import gutenberg
import fileinput

list = ["carmilla.txt", "pirate-caribbee.txt", "rider-sage.txt"]

for l in list:
    f = open(l)
    raw1 = f.read()
    print "<-----Here goes nothing"
    head = raw1[:680]
    foot = raw1[157560:176380]
    content = raw1[680:157560]
    print "Done---->"

content = [re.sub(r'[\']', '', text) for text in content]
content=[re.sub(r'[^\w\s\.]', ' ', text) for text in content]

print content

propernouns = []
for story in content:
    propernouns = propernouns+re.findall(r'Mr.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Mrs.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Ms.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Miss.[\s][\w]+', story)

propernouns = set(propernouns)
print "\nNumber of proper nouns: " + str(len(propernouns))
print "\nExamples from our list of proper nouns: "+str(sorted(propernouns))

#Strip all of the above out of text
for word in propernouns:
    content = [re.sub(" "+word+" "," ",story) for story in content]

import string
content = [story.translate(string.maketrans("",""), "_.0123456789")]

print "\n[2] -----Carmilla Text-----"
print content

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
    f1.close()
    f2.close()

stopfile = open('stopwords2.txt')

print "Examples of stopwords: "
print stopfile.read()

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stopfile , min_df=1)
stories_tdm = cv.fit_transform(content).toarray()

Running this does not complete; I get the following error:

ValueError: empty vocabulary; perhaps the documents only contain stop words

I am not sure where to go from here. As a test I substituted another file for content, and it looks like the stopfile is not being used at all. I cannot seem to get this working. Has anyone else run into this problem? Am I missing something simple?


1 Answer

Answered 2024-04-26 04:31:27

Remember to close your files properly: f.close() is missing entirely, and f1.close() and f2.close() should not be indented inside the loop.

I think this will fix your problem.

for l in list:
    f = open(l)
    raw1 = f.read()
    print "<-----Here goes nothing"
    head = raw1[:680]
    foot = raw1[157560:176380]
    content = raw1[680:157560]
    print "Done---->"
    f.close()

…
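As a side note, Python's with statement closes a file automatically when the block exits, even on an exception, so no explicit f.close() call is needed. A minimal self-contained sketch (Python 3 syntax, with a small sample file standing in for the question's text files):

```python
import os
import tempfile

# Write a small sample file so the sketch is self-contained.
path = os.path.join(tempfile.mkdtemp(), "carmilla.txt")
with open(path, "w") as f:
    f.write("In Styria, we, though by no means magnificent people...")

# "with" closes the file automatically when the block exits,
# so no explicit f.close() is needed.
with open(path) as f:
    raw1 = f.read()

print(f.closed)  # prints True: the handle is closed once the block exits
```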

Edit: I see two more problems.

One is this line:

content = [story.translate(string.maketrans("",""), "_.0123456789")]

The story variable does not exist at this indentation level, so clarify what you intended here.
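Presumably a list comprehension over content was intended. In Python 3 the old two-argument translate/maketrans idiom becomes a single str.maketrans call with three arguments; a sketch with hypothetical sample strings in place of the real story texts:

```python
# Guessing at the intent: delete underscores, periods and digits from each
# story in content via a list comprehension (Python 3 translate idiom).
content = ["Chapter 1. _Carmilla_", "Page 42."]

table = str.maketrans("", "", "_.0123456789")  # chars in 3rd arg are deleted
content = [story.translate(table) for story in content]

print(content)  # ['Chapter  Carmilla', 'Page ']
```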

The other problem is that stop_words may be a string, a list, or None. When it is a string, the only supported value is 'english'. In your case, however, you are passing a file handle:

stopfile = open('stopwords2.txt')
#...
cv = CountVectorizer(stop_words = stopfile , min_df=1)

What you should do is turn the text in stopfile into a list of strings. Replace this:

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
    f1.close()
    f2.close()

stopfile = open('stopwords2.txt')

print "Examples of stopwords: "
print stopfile.read()

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stopfile , min_df=1)

with this:

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
stoplist = []
for line in f1:
    nextlist = line.replace('\n', ' ').split()
    stoplist.extend(nextlist)
f1.close()

print "Examples of stopwords: "
print stoplist


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stoplist, min_df=1)
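As a quick check that a list of stop words is actually applied, here is a minimal sketch (Python 3 syntax, with inline sample documents and a made-up stop list instead of the question's text files):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical sample data; the real stoplist comes from stopwords.txt.
stoplist = ["the", "and", "a"]
docs = ["the vampire and the castle", "a rider of the sage"]

cv = CountVectorizer(stop_words=stoplist, min_df=1)
tdm = cv.fit_transform(docs).toarray()

# Stop words are absent from the learned vocabulary.
print(sorted(cv.vocabulary_))  # ['castle', 'of', 'rider', 'sage', 'vampire']
```

If every token in the corpus ends up in the stop list (or the input is a sequence of single characters rather than documents), fit_transform raises exactly the "empty vocabulary" ValueError from the question.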
