Text classification with the Brown corpus

Posted 2024-04-27 00:31:29


I want to treat predicting a Brown corpus document's category as a text classification task. I am experimenting with different kinds of features, for example the frequencies of stopwords. Could you check whether I am doing this correctly, or whether there is a problem in my code? Any suggestions would be appreciated.

from collections import defaultdict
from nltk.corpus import brown, stopwords
import random
import nltk

dataset = []  # 500 samples

for category in brown.categories():
    for fileid in brown.fileids(category):
        dataset.append((brown.words(fileids=fileid), category))

dataset = [([w.lower() for w in text], category) for text, category in dataset]

def feature_extractor(text, bag):
    # bag -> the feature vocabulary; count how often each of its
    # words occurs in text
    frec = defaultdict(int)
    for word in text:
        if word in bag:
            frec[word] += 1

    return frec

# training & test 90%-10% naivebayes nltk

def train_and_test(featureset,n=90):

    random.shuffle(featureset)
    split = int((len(featureset)*n)/100)
    train,test = featureset[:split],featureset[split:]
    classifier = nltk.NaiveBayesClassifier.train(train)
    accuracy= nltk.classify.accuracy(classifier, test)
    return accuracy

# Stopwords as features (renamed so the imported stopwords module
# is not shadowed)
stop_words = stopwords.words("english")  # 153 words here; the list length varies by NLTK version

featureset = [(feature_extractor(text, stop_words), category) for text, category in dataset]

print("Accuracy:", train_and_test(featureset))  # around 0.25

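One NLTK-specific detail is also worth noting: NaiveBayesClassifier treats every feature value as a discrete symbol, so the raw counts 37 and 38 are as unrelated to it as two different words, and most exact counts appear only a few times across 450 training files. Coarsely binning the counts, and averaging accuracy over several shuffles (a single 10% split is only 50 test documents), usually gives a steadier picture. A hedged sketch reusing the definitions above; the helper names and the log-scale binning are my choices, not the only options:

import math

stop_set = set(stop_words)  # O(1) membership tests instead of scanning a list

def binned_feature_extractor(text, bag):
    frec = defaultdict(int)
    for word in text:
        if word in bag:
            frec[word] += 1
    # Map each raw count onto a few log-scale buckets so the classifier
    # sees a small set of discrete values per feature.
    return {word: int(math.log2(count + 1)) for word, count in frec.items()}

featureset = [(binned_feature_extractor(text, stop_set), category)
              for text, category in dataset]

# Average over several random splits to smooth out the variance
# from testing on only 50 documents at a time.
runs = [train_and_test(featureset) for _ in range(10)]
print("Mean accuracy:", sum(runs) / len(runs))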