利用支持向量机进行多标签标注。为什么有些标签的f1分数较低?

2024-06-09 03:23:39 发布

您现在位置:Python中文网/ 问答频道 /正文

我用 Python 实现了一个基于支持向量机(SVM)的分类器。由于这是一个多标签分类问题,我为每个标签分别训练了一个 SVM 模型。现在,当我测试这些模型时,我不明白为什么其中一些标签的 F1 分数很低。一个可能的原因是我的训练数据没有我想象的那么准确,但仅凭这一点足以解释这个问题吗?请帮我分析一下为什么会出现这种情况。我刚开始学习机器学习,在这方面还是新手。请帮帮我,让我能顺利通过论文。非常感谢各位。

附件是各标签平均 F1 分数(Average f1 scores per label)的图像

以下是我的代码的一些部分:

    # Connect to the database

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(df, random_state=42, train_size=0.80, test_size=0.20, shuffle=True)

train_text = train['question_body']
test_text = test['question_body']

# TF-IDF over word 1- to 3-grams with L2-normalised rows. With an integer
# min_df, terms that appear in fewer than 15 training documents are dropped.
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stop_words, analyzer='word', ngram_range=(1,3), norm='l2', min_df=15)

# Learn the vocabulary from the training text only and vectorise it in the
# same pass (equivalent to fit() followed by transform(), without tokenising
# the training corpus twice).
x_train = vectorizer.fit_transform(train_text)

# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('./data-models/vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Everything except the raw text column is kept as the target frame
# (presumably one column per label -- confirm against the schema of df).
y_train = train.drop(labels=['question_body'], axis=1)

# The test text is only transformed, never fitted, so no test-set statistics
# leak into the vocabulary or the IDF weights.
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels=['question_body'], axis=1)

def _one_vs_rest_pipeline(base_estimator):
    """Return a single-step pipeline wrapping *base_estimator* one-vs-rest.

    The step is named 'clf', so hyper-parameters of the wrapped estimator are
    addressed as 'clf__estimator__<param>' in a parameter grid.
    """
    return Pipeline([
        ('clf', OneVsRestClassifier(base_estimator, n_jobs=-1)),
    ])


# One template pipeline per model family: linear SVM, logistic regression
# (SAG solver) and multinomial Naive Bayes.
SVC_pipeline = _one_vs_rest_pipeline(LinearSVC())
LogReg_pipeline = _one_vs_rest_pipeline(LogisticRegression(solver='sag'))
NB_pipeline = _one_vs_rest_pipeline(MultinomialNB())

# Hyper-parameter grids for the cross-validated search.
# Keys follow the '<step>__estimator__<param>' convention so they reach the
# estimator wrapped by the one-vs-rest classifier in the 'clf' step.
_c_candidates = np.arange(1, 5)  # C in {1, 2, 3, 4}
_tol_candidates = [1, 1e-2, 1e-3, 1e-4, 1e-8]
_alpha_candidates = [1, 0.1, 0.01]

# Shared by the linear SVM and the logistic regression searches.
param_grid = {
    'clf__estimator__C': _c_candidates,
    'clf__estimator__tol': _tol_candidates,
}
# Smoothing strengths tried for multinomial Naive Bayes.
nb_param_grid = {'clf__estimator__alpha': _alpha_candidates}

def _tune_and_save(pipeline, grid, label, file_prefix, category):
    """Grid-search *pipeline* for one category and pickle the winning model.

    Runs a 10-fold cross-validated search scored by macro-averaged F1 on
    ``x_train`` against the ``train[category]`` column, prints the tuned
    parameters under *label*, and writes the refitted best estimator to
    ``./data-models/<file_prefix><category>.sav``.

    GridSearchCV refits the best configuration on the whole training set by
    default (``refit=True``), so ``best_estimator_`` is already the final
    model; rebuilding a pipeline from ``best_params_`` and fitting it again
    would only repeat that work.

    Returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(pipeline, grid, cv=10, scoring='f1_macro')
    search.fit(x_train, train[category])
    print(label)
    print("Tuned Parameters: {}".format(search.best_params_))

    # filename = <model name> + <category name> + '.sav'
    filename = file_prefix + category + '.sav'
    # The context manager closes the file even if pickling fails.
    with open('./data-models/' + filename, 'wb') as model_file:
        pickle.dump(search.best_estimator_, model_file)
    return search


# Train, tune and store one binary model per category for each model family.
# NOTE(review): a low F1 on individual labels is often a symptom of few
# positive examples for that label -- check the per-category label counts
# before blaming the model; class weighting may be worth trying.
for category in categories:

    print('... Processing {}'.format(category))

    # Linear SVM
    svc_clf_cv = _tune_and_save(SVC_pipeline, param_grid, "SVC:", 'svc-', category)
    SVC2_pipeline = svc_clf_cv.best_estimator_

    # Logistic regression
    lr_clf_cv = _tune_and_save(LogReg_pipeline, param_grid, "Log Reg:", 'lr-', category)
    LogReg2_pipeline = lr_clf_cv.best_estimator_

    # Multinomial Naive Bayes
    nb_clf_cv = _tune_and_save(NB_pipeline, nb_param_grid, "NB:", 'nb-', category)
    NB2_pipeline = nb_clf_cv.best_estimator_

Tags: name, test, pipeline, models, train, params, filename, cv