我已经成功运行了主题模型代码,但希望每个主题都有各自不同的词云(wordcloud)

2024-04-20 02:24:32 发布

您现在位置:Python中文网/ 问答频道 /正文

# I am importing all the packages necessary for topic modelling in this step
import os
import random
import re
from os import path
from time import sleep

import gensim
import matplotlib.pyplot as plt
import wordcloud
from gensim import corpora, models
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from wordcloud import WordCloud, STOPWORDS
# Tokenizer that extracts runs of word characters (the \w+ pattern), which
# discards punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')
# English stop words held in a set so the membership test used when filtering
# tokens is a constant-time lookup.
en_stop = set(get_stop_words('en'))
# Raw string literal: the backslashes of a Windows path must not be read as
# escape sequences ('\U' in a normal literal is a SyntaxError on Python 3).
with open(r'D:\Users\kaila\jobdescription.txt') as f:
    Reader = f.read()
# Strip filler words that would otherwise dominate the word clouds.
# Matching is whole-word and case-insensitive: plain substring replacement
# would mangle longer words that merely contain one of these (e.g. "network",
# "willing"), and capitalised occurrences (e.g. "Will") would survive until
# the text is lower-cased later and then show up as tokens anyway.
Reader = re.sub(
    r'\b(?:will|experience|work|years|please|apply)\b',
    ' ',
    Reader,
    flags=re.IGNORECASE
)


# Reader is a byte string on Python 2 and already text on Python 3, so decode
# only when needed (the `unicode` builtin does not exist on Python 3).
# Undecodable bytes are replaced rather than raising.
texts = Reader.decode('utf-8', 'replace') if isinstance(Reader, bytes) else Reader

# Build one token list ("document") per non-empty line. Feeding the whole file
# to LDA as a single document gives the model no per-document variation to
# separate topics with, so every topic ends up nearly identical.
# NOTE(review): assumes each line of the input is a separate posting or
# paragraph - confirm against the real file, or split on another delimiter.
tdm = []
for line in texts.splitlines():
    tokens = tokenizer.tokenize(line.lower())
    stopped_tokens = [tok for tok in tokens if tok not in en_stop]
    # Lines that contain only stop words or punctuation add no information.
    if stopped_tokens:
        tdm.append(stopped_tokens)

# Map each distinct token to an integer id, then turn every document into a
# bag-of-words vector.
dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(doc) for doc in tdm]
# Fit a 5-topic LDA model on the bag-of-words corpus.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary)

# The model summary and the per-topic top words do not depend on the loop
# variable, so print them once instead of once per topic.
print(ldamodel)
print(ldamodel.print_topics(num_topics=5, num_words=8))

# Draw one word cloud per topic, built from that topic's 200 top words.
for t in range(ldamodel.num_topics):
    # fit_words expects a word -> frequency mapping.
    # NOTE(review): assumes show_topic returns (word, weight) pairs, as in
    # current gensim releases - confirm for the installed version.
    topic_weights = dict(ldamodel.show_topic(t, 200))
    plt.figure()
    plt.imshow(WordCloud().fit_words(topic_weights))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()

这样做之后,我得到了主题模型和词云(wordcloud),但各个主题的结果非常相似。我该怎么做,才能让每个主题及其词云各不相同?


Tags: in, from, import, for, plt, num, replace, reader