擅长:python、mysql、java
<p>这是一种更好的方法:请注意,TfidfVectorizer 有一个 <code>tokenizer</code> 参数,它接受一个可调用对象(函数),该函数接收一段文本并返回清理后的单词列表。
我想这可能对你有用:</p>
<pre><code>from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
# word_tokenize needs the Punkt tokenizer models and WordNetLemmatizer needs
# the WordNet corpus, so fetch them together with the stopword list; without
# them the first tokenizer call fails with LookupError on a fresh NLTK install.
# 'punkt_tab' is the name newer NLTK releases look up for the Punkt models.
# Each resource is requested separately so that a problem with one of them
# does not stop the others from being fetched.
for _resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_resource)
# Extend stop_words with any extra words you want to drop, or replace it with
# your own array-like collection of stopwords.
stop_words = stopwords.words('english')
def preprocessing(line):
    """Turn one raw text into a list of lemmatized, stopword-free tokens.

    Intended to be passed as the ``tokenizer`` callable of
    ``TfidfVectorizer``.

    Args:
        line: The text to tokenize (a ``str``).

    Returns:
        A list of lemmatized tokens in order of appearance, with every token
        found in the module-level ``stop_words`` removed.
    """
    # Lower-case first, then blank out everything that is not an ASCII letter
    # so digits and punctuation never reach the tokenizer.
    line = re.sub(r"[^a-zA-Z]", " ", line.lower())
    words = word_tokenize(line)
    # Build the lemmatizer and the stopword lookup once per call instead of
    # once per token; a set gives constant-time membership tests.
    lemmatize = WordNetLemmatizer().lemmatize
    excluded = set(stop_words)
    return [lemmatize(w) for w in words if w not in excluded]
# preprocessing() replaces the built-in tokenization step, so token_pattern is
# never used; passing None avoids scikit-learn's warning about the ignored
# parameter.
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocessing, token_pattern=None)
# assumes df is a pandas DataFrame whose 'Texts' column holds the raw
# documents; df is defined outside this snippet — confirm against the caller.
tfidf = tfidf_vectorizer.fit_transform(df['Texts'])
# Cluster the TF-IDF rows into two groups. Pass random_state=<int> to KMeans
# if reproducible cluster assignments are needed.
kmeans = KMeans(n_clusters=2).fit(tfidf)
</code></pre>