# I am importing all the packages necessary for topic modelling in this step
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import gensim
from gensim import corpora, models
import os
from os import path
from time import sleep
import matplotlib.pyplot as plt
import random
import wordcloud
from wordcloud import WordCloud, STOPWORDS
# --- Tokenizer / stop-word setup and document preprocessing ---

# \w+ keeps alphanumeric runs and drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
# English stop words; a set gives O(1) membership tests in the filter below.
en_stop = set(get_stop_words('en'))

# Read the job-description text. The path must be a raw string: in a normal
# string literal '\U' is an invalid unicode escape (SyntaxError in Python 3).
# errors='replace' on open() subsumes the old Python-2
# unicode(Reader, errors='replace') call.
with open(r'D:\Users\kaila\jobdescription.txt',
          encoding='utf-8', errors='replace') as f:
    Reader = f.read()

# Blank out domain-generic words that would otherwise dominate the
# wordcloud / topic output.
for noise_word in ("will", "experience", "work", "years", "please", "apply"):
    Reader = Reader.replace(noise_word, " ")

# Already a str in Python 3 — kept under the original name for the code below.
texts = Reader

# Lower-case, tokenize, and drop stop words; tdm holds one token list per
# document (only a single document here).
tdm = []
raw = texts.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [tok for tok in tokens if tok not in en_stop]
tdm.append(stopped_tokens)
# --- Topic modelling and per-topic wordclouds ---

# Bag-of-words corpus built from the tokenised documents.
dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(doc) for doc in tdm]

# Train a 5-topic LDA model over the corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics=5, id2word=dictionary)

# The model summary and the topic listing do not depend on the loop
# variable, so print them once instead of once per topic.
print(ldamodel)
print(ldamodel.print_topics(num_topics=5, num_words=8))

for t in range(ldamodel.num_topics):
    plt.figure()
    # WordCloud.fit_words expects a {word: weight} mapping, while
    # show_topic returns (word, probability) pairs — convert to a dict.
    plt.imshow(WordCloud().fit_words(dict(ldamodel.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
# Question (translated from the original Chinese, kept as a comment so the
# file remains valid Python): "By doing this I got topic models and
# wordclouds, but the results are very similar. What can I do to get
# distinct topic-model and wordcloud results?"
# (No answers had been posted yet.)
#
# NOTE(review): with only ONE document in the corpus, all five LDA topics
# are fit to the same bag of words, so near-identical topics are expected.
# Split the text into multiple documents (e.g. one per job posting) and/or
# raise the `passes` argument of LdaModel to get differentiated topics.