如何解决“ValueError:empty vocabulary~”？

File "final.py", line 97, in <module> stream.sample() File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample self._start(is_async) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start self._run() File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run six.reraise(*exc_info) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise raise value File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run self._read_loop(resp) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop self._data(next_status_obj) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data if self.listener.on_data(data) is False: File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data if self.on_status(status) is False: File "final.py", line 78, in on_status tfidf = vectorizer.fit_transform(corpus) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform X = super().fit_transform(raw_documents) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform self.fixed_vocabulary_) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 989, in _count_vocab raise ValueError("empty vocabulary; perhaps the documents only" ValueError: empty vocabulary; perhaps the documents only contain stop words

import os import tweepy import redis import math from collections import Counter import re from natto import MeCab import codecs import sys from sklearn.feature_extraction.text import TfidfVectorizer import glob import numpy as np #r = redis.Redis(host='localhost', port=6379, db=0) TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID'] TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET'] TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN'] TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET'] auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID,TWITTER_CLIENT_SECRET) auth.set_access_token(TWITTER_OAUTH_TOKEN,TWITTER_OAUTH_TOKEN_SECRET) class StreamListener(tweepy.StreamListener): def __init__(self): super().__init__() self.count = 0 # Number of tweets acquired def on_status(self, status): text = str(status.text) text2 = re.sub(r"http\S+", "", text) text3 = re.sub(r"@(\w+) ", "", text2) text4 = re.sub(r"#(\w+)", "", text3) text5 = re.sub(r"RT(\w+)", "", text4) #Unable to erase retweet emoji_pattern = re.compile("[" u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF" "]+", flags=re.UNICODE) text6 = emoji_pattern.sub("", text5) #Unable to erase Emoji #Writing Japanese tweets to a file + Displaying the number of tweets if status.lang == "ja": self.count += 1 print(self.count, text6) with open("test37.txt", "a", encoding="utf-8") as f: f.write(text6) with codecs.open("test37.txt", "r", "utf-8") as f: corpus = f.read().split("\n") mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd') #if tagger.lang == 'ja': rm_list = ["RT","https","co","@","__"] docs = [] for txt in corpus: words = mecab.parse(txt, as_nodes=True) doc = [] #Morphological analysis using MeCab for w in words: if w.feature.split(",")[0] == "名詞": #名詞 = noun if len(w.surface) >= 3: if not any(rm in w.surface for rm in rm_list): doc.append(str(w.surface)) doc = ' '.join(doc) docs.append(doc) corpus = docs #tf-idf calculation vectorizer = TfidfVectorizer(min_df=0.03) tfidf = vectorizer.fit_transform(corpus) #Sort words by score feature_names = np.array(vectorizer.get_feature_names()) for vec in tfidf: index = np.argsort(vec.toarray(), axis=1)[:,::-1] feature_words = feature_names[index] #print(corpus) print(feature_words[:,:10]) def on_error(self, status_code): return False stream = tweepy.Stream(auth=auth, listener=StreamListener()) stream.sample()

0条回答

目前没有回答

错误

代码

附加信息

相关问题更多 >

编程相关推荐

热门问题

热门文章