无法理解 scikit-learn 中的错误

"""Topic-modelling demo: fit NMF and LDA on review texts and print the topics.

For each model the script prints, per topic, the highest-weighted words and
the documents that load most strongly on that topic.

NOTE(review): written against the scikit-learn >= 1.0 API
(``get_feature_names_out``, ``alpha_W``/``alpha_H``, ``n_components``) --
confirm the installed version.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame

# Location of the review dump read by main().
CSV_PATH = r'C:\Users\yelp_review10K.csv'


def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the top words and top documents of every topic.

    Args:
        H: 2-D array, one row per topic and one column per vocabulary term
            (e.g. a fitted model's ``components_``).
        W: 2-D array, one row per document and one column per topic
            (e.g. the output of ``model.transform``).
        feature_names: Sequence mapping a column of ``H`` to its term.
        documents: The documents, in the same order as the rows of ``W``.
            May be a plain sequence or a pandas Series/DataFrame; pandas
            objects are always read by position.
        no_top_words: Number of highest-weighted words printed per topic.
        no_top_documents: Number of highest-weighted documents printed per
            topic.
    """
    # ``frame[3]`` on a DataFrame is a *column* lookup (KeyError for an
    # integer), and on a Series it is label-based. The indices computed below
    # are row positions, so go through ``.iloc`` whenever it is available.
    by_position = documents.iloc if hasattr(documents, "iloc") else documents

    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % topic_idx)

        # argsort is ascending; the reversed tail slice yields the
        # ``no_top_words`` largest weights, largest first.
        top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in top_word_indices))

        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][:no_top_documents]
        for doc_index in top_doc_indices:
            print(by_position[doc_index])


def main(csv_path=CSV_PATH):
    """Fit NMF and LDA on the ``text`` column of the CSV and print the topics.

    Args:
        csv_path: Path of the CSV file to read; it must provide a ``text``
            column.
    """
    # Imported here so that display_topics() stays importable on machines
    # without scikit-learn installed.
    from sklearn.decomposition import NMF, LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    raw = pd.read_csv(csv_path)
    documents = DataFrame(
        raw,
        columns=['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id'],
    )
    # Plain list of review strings: positional indexing then works both for
    # the vectorizers and for display_topics().
    texts = documents['text'].tolist()

    no_features = 1000

    # NMF is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(texts)
    # get_feature_names() was removed from scikit-learn; this is its
    # replacement.
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

    # LDA can only use raw term counts because it is a probabilistic
    # graphical model
    tf_vectorizer = CountVectorizer(
        max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(texts)
    tf_feature_names = tf_vectorizer.get_feature_names_out()

    no_topics = 5

    # Run NMF
    # The single ``alpha`` argument was replaced by ``alpha_W``/``alpha_H``.
    # NOTE(review): per the NMF documentation the new penalties are multiplied
    # by n_features (for W) and n_samples (for H); dividing here is intended
    # to keep the strength of the former ``alpha=.1`` -- confirm against the
    # installed version.
    n_samples, n_features = tfidf.shape
    nmf_model = NMF(
        n_components=no_topics,
        random_state=1,
        alpha_W=.1 / n_features,
        alpha_H=.1 / n_samples,
        l1_ratio=.5,
        init='nndsvd',
    ).fit(tfidf)
    nmf_W = nmf_model.transform(tfidf)
    nmf_H = nmf_model.components_

    # Run LDA
    # ``n_topics`` was renamed to ``n_components``.
    lda_model = LatentDirichletAllocation(
        n_components=no_topics,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    ).fit(tf)
    lda_W = lda_model.transform(tf)
    lda_H = lda_model.components_

    no_top_words = 5
    no_top_documents = 2
    display_topics(nmf_H, nmf_W, tfidf_feature_names, texts, no_top_words, no_top_documents)
    display_topics(lda_H, lda_W, tf_feature_names, texts, no_top_words, no_top_documents)


if __name__ == "__main__":
    main()

0条回答

目前没有回答

相关问题更多 >

编程相关推荐

热门问题

热门文章