import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Count how often a fixed list of n-grams occurs in new documents, and show
# the result as a pandas DataFrame (one row per document, one column per term).

# Target n-grams (single words and two-word phrases) used to train the
# vectorizer's vocabulary.
ngrams = [
    'coffee',
    'darkly',
    'darkly colored',
    'bitter',
    'stimulating',
    'drinks',
    'stimulating drinks',
]

# Documents to score. Adjacent string literals are concatenated by the parser,
# so each entry is one single-line string (no embedded newlines).
new_docs = [
    (
        'Coffee is darkly colored, bitter, slightly acidic and '
        'has a stimulating effect in humans, primarily due to its '
        'caffeine content.[3] '
    ),
    (
        'It is one of the most popular drinks '
        'in the world,[4] and it can be prepared and presented in a '
        'variety of ways (e.g., espresso, French press, caffè latte). '
    ),
]

# Instantiate CountVectorizer and train it with your ngrams.
# ngram_range=(1, 2) extracts both unigrams and bigrams. NOTE: every entry of
# `ngrams` is tokenized like a document, so the learned vocabulary also
# contains the unigram 'colored' (from 'darkly colored').
cv = CountVectorizer(ngram_range=(1, 2))
cv.fit(ngrams)
cv.vocabulary_  # bare expression: only displayed in a notebook/REPL

# Apply the vectorizer to new documents and display the dense matrix.
# transform() returns a sparse result; toarray() is the explicit equivalent of
# the `.A` shorthand and is computed once so it can be reused below.
counts = cv.transform(new_docs)
dense_counts = counts.toarray()
dense_counts

# Turn the results into a data frame.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
counts_df = pd.DataFrame(dense_counts, columns=feature_names)
counts_df
# You can achieve this by training CountVectorizer with your list of n-grams.
输出
相关问题 更多 >
编程相关推荐