多类分类：多个列的SMOTE过采样

# NOTE: TfidfVectorizer, train_test_split and city_country are not defined in
# this snippet; they are assumed to be imported/created earlier in the script.
from collections import Counter

from imblearn.over_sampling import SMOTE

# Unigram TF-IDF features built from the preprocessed tweets.
# min_df=1 keeps every term (same vocabulary as the original min_df=0), but is
# also accepted by scikit-learn versions that reject an integer min_df below 1.
tfidf_words = TfidfVectorizer(
    sublinear_tf=True,
    min_df=1,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 1),
    stop_words='english',
)
x_words = tfidf_words.fit_transform(city_country.preprocessed).toarray()

# Labels: the event_id for each preprocessed tweet.
y = city_country.event_id

x_train_words, x_test_words, y_train, y_test = train_test_split(
    x_words, y, test_size=0.25, random_state=0
)

# Use SMOTE to oversample the minority classes of the training split only.
# fit_resample replaces the fit_sample method that imbalanced-learn removed.
sm = SMOTE(random_state=12)
x_train_words_sm, y_train_words_sm = sm.fit_resample(x_train_words, y_train)

# Count the occurrences per class before and after resampling to ensure that
# the oversampling worked.
class_check_woSMOTE = Counter(y_train)
class_check_words = Counter(y_train_words_sm)

1条回答

网友

1楼 · 发布于 2024-04-26 00:05:57

imblearn.over_sampling 中的 SMOTE 可以接受稀疏向量作为输入。你可以先进行过采样，然后再拆分为测试集/训练集。

如果我正确理解了你的问题，以下做法对我有效

请尝试以下操作：

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Raw text column to vectorize (city_country is defined outside this snippet).
strings = city_country.preprocessed


def create_vec(strings):
    """Return the TF-IDF matrix of character n-grams for ``strings``.

    The vectorizer uses the ``char_wb`` analyzer with 2- and 3-character
    n-grams. The result of ``fit_transform`` is returned as-is (not
    densified) and is passed straight to SMOTE below.
    """
    tf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3))
    return tf.fit_transform(strings)


vecs = create_vec(strings)

y = city_country.event_id

# Oversample the minority classes; the feature matrix is `vecs` (the original
# snippet passed an undefined name `X` here).
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(vecs, y)

然后可以对过采样后的输出（X_res、y_res）进行训练集/测试集拆分

相关问题更多 >

编程相关推荐

热门问题

热门文章