Python - sklearn - ValueError: empty vocabulary
I am trying to imitate a project that was done before, but I am running into trouble with the CountVectorizer function. Below is the code relevant to the problem.
from __future__ import division
import nltk, textmining, pprint, re, os.path
#import numpy as np
from nltk.corpus import gutenberg
import fileinput
list = ["carmilla.txt", "pirate-caribbee.txt", "rider-sage.txt"]
for l in list:
    f = open(l)
    raw1 = f.read()
    print "<-----Here goes nothing"
    head = raw1[:680]
    foot = raw1[157560:176380]
    content = raw1[680:157560]
    print "Done---->"
content = [re.sub(r'[\']', '', text) for text in content]
content = [re.sub(r'[^\w\s\.]', ' ', text) for text in content]
print content
propernouns = []
for story in content:
    propernouns = propernouns+re.findall(r'Mr.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Mrs.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Ms.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Miss.[\s][\w]+', story)
propernouns = set(propernouns)
print "\nNumber of proper nouns: " + str(len(propernouns))
print "\nExamples from our list of proper nouns: "+str(sorted(propernouns))
#Strip all of the above out of text
for word in propernouns:
    content = [re.sub(" "+word+" "," ",story) for story in content]
import string
content = [story.translate(string.maketrans("",""), "_.0123456789")]
print "\n[2] -----Carmilla Text-----"
print content
#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
    f1.close()
    f2.close()
stopfile = open('stopwords2.txt')
print "Examples of stopwords: "
print stopfile.read()
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stopfile , min_df=1)
stories_tdm = cv.fit_transform(content).toarray()
The code never finishes executing, and I get these errors:
Traceback (most recent call last):
File "C:\Users\mnate_000\workspace\de.vogella.python.third\src\TestFile_EDIT.py", line 84, in <module>
stories_tdm = cv.fit_transform(content).toarray()
File "C:\Users\mnate_000\Anaconda\lib\site-packages\sklearn\feature_extraction\text.py", line 780, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary)
File "C:\Users\mnate_000\Anaconda\lib\site-packages\sklearn\feature_extraction\text.py", line 727, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
I am not sure what to do, because when I tried swapping in another file for "content" to test it, it told me I was not using the stopwords file. I just cannot get this to run properly. Has anyone run into this problem before? Am I missing something simple?
1 Answer
Remember to close your files properly. f.close() is missing here, f2.close() should not be indented, and neither should f1.close().
I think this may fix your problem:
for l in list:
    f = open(l)
    raw1 = f.read()
    print "<-----Here goes nothing"
    head = raw1[:680]
    foot = raw1[157560:176380]
    content = raw1[680:157560]
    print "Done---->"
    f.close()
...
#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
f1.close()
f2.close()
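As an aside, a with block sidesteps the indentation pitfall entirely, because both files are closed automatically when the block exits. A minimal sketch of the same copy step, using the same file names as in your code:

# same stopwords copy, but the with block closes both files for us
with open('stopwords.txt', 'r') as f1, open('stopwords2.txt', 'w') as f2:
    for line in f1:
        f2.write(line.replace('\n', ' '))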
Edit
I see two more problems.
One is this: content = [story.translate(string.maketrans("",""), "_.0123456789")]
At that indentation level there is no story variable, so please clarify what you meant.
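If the intent was to strip underscores, periods, and digits from every story, one plausible fix is a list comprehension. This is only a sketch of what I assume you meant, relying on content being a list of plain str objects (Python 2's str.translate takes a translation table plus a string of characters to delete):

import string
# delete "_", ".", and digits from each story in the list
content = [story.translate(string.maketrans("", ""), "_.0123456789") for story in content]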
The other problem is that stop_words may be a string, a list, or None. If it is a string, the only supported value is 'english'. In your case, however, you are passing in a file handle:
stopfile = open('stopwords2.txt')
#...
cv = CountVectorizer(stop_words = stopfile , min_df=1)
You should turn all of the text in stopfile into a list of strings.
Change this:
#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
    f1.close()
    f2.close()
stopfile = open('stopwords2.txt')
print "Examples of stopwords: "
print stopfile.read()
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stopfile , min_df=1)
to this:
#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
stoplist = []
for line in f1:
    nextlist = line.replace('\n', ' ').split()
    stoplist.extend(nextlist)
f1.close()
print "Examples of stopwords: "
print stoplist
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stoplist, min_df=1)
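To double-check the fix, here is a minimal sketch continuing from the block above (stoplist and the CountVectorizer import come from there; the two toy documents are invented for illustration) showing that fit_transform accepts a plain list for stop_words:

# toy corpus (hypothetical) -- confirms the vectorizer works with a stopword list
docs = ["the captain sailed the ship", "a rider crossed the purple sage"]
cv = CountVectorizer(stop_words = stoplist, min_df=1)
stories_tdm = cv.fit_transform(docs).toarray()
print cv.get_feature_names()   # the vocabulary that was actually learned
print stories_tdm              # per-document term counts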