如何解决“ValueError:empty vocabulary~”?

2024-04-29 20:23:26 发布

您现在位置:Python中文网/ 问答频道 /正文

在对实时获取的tweet进行形态学分析后,尝试对tf-idf得分为0.03或更高的名词进行排序时出现了此错误。 而且,我不能删除我收到的推文中的转发和表情符号

你能告诉我代码里面发生了什么以及如何修复它吗

错误

  File "final.py", line 97, in <module>
    stream.sample()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample
    self._start(is_async)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start
    self._run()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run
    six.reraise(*exc_info)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise
    raise value
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run
    self._read_loop(resp)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop
    self._data(next_status_obj)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data
    if self.listener.on_data(data) is False:
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data
    if self.on_status(status) is False:
  File "final.py", line 78, in on_status
    tfidf = vectorizer.fit_transform(corpus)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform
    self.fixed_vocabulary_)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 989, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words

代码

import os
import tweepy
import redis
import math
from collections import Counter
import re
from natto import MeCab
import codecs
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import numpy as np

#r = redis.Redis(host='localhost', port=6379, db=0)

TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID']
TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET']

TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN']
TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET']

auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID,TWITTER_CLIENT_SECRET)
auth.set_access_token(TWITTER_OAUTH_TOKEN,TWITTER_OAUTH_TOKEN_SECRET)

class StreamListener(tweepy.StreamListener):
    def __init__(self):
        super().__init__()
        self.count = 0 # Number of tweets acquired

    def on_status(self, status):
        text = str(status.text)
        text2 = re.sub(r"http\S+", "", text)
        text3 = re.sub(r"@(\w+) ", "", text2)
        text4 = re.sub(r"#(\w+)", "", text3)
        text5 = re.sub(r"RT(\w+)", "", text4) #Unable to erase retweet
        emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+", flags=re.UNICODE)
        text6 = emoji_pattern.sub("", text5) #Unable to erase Emoji

        #Writing Japanese tweets to a file + Displaying the number of tweets
        if status.lang == "ja":
            self.count += 1
            print(self.count, text6)
            with open("test37.txt", "a", encoding="utf-8") as f:
                f.write(text6)
            with codecs.open("test37.txt", "r", "utf-8") as f:
                corpus = f.read().split("\n")

            mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

            #if tagger.lang == 'ja':

            rm_list = ["RT","https","co","@","__"]

            docs = []
            for txt in corpus:
                words = mecab.parse(txt, as_nodes=True)
                doc = []
        #Morphological analysis using MeCab
                for w in words:
                    if w.feature.split(",")[0] == "名詞": #名詞 = noun
                        if len(w.surface) >= 3:
                            if not any(rm in w.surface for rm in rm_list):
                                doc.append(str(w.surface))

                doc = ' '.join(doc)
                docs.append(doc)
            corpus = docs

        #tf-idf calculation
            vectorizer = TfidfVectorizer(min_df=0.03)
            tfidf = vectorizer.fit_transform(corpus)

            #Sort words by score
            feature_names = np.array(vectorizer.get_feature_names())
            for vec in tfidf:
                index = np.argsort(vec.toarray(), axis=1)[:,::-1]
                feature_words = feature_names[index]
                #print(corpus)
                print(feature_words[:,:10])

    def on_error(self, status_code):
        return False

stream = tweepy.Stream(auth=auth, listener=StreamListener())
stream.sample()

附加信息

iOS 10.12.6、Python 3.7.3、Atom


Tags: inpyimportselfliblinesitetwitter