优化NLTK代码以从文本中进行预测
我正在尝试建立一个模型,用来预测工作描述的薪水是高于还是低于75百分位(高于1,低于0)。我的数据大约有25万行,而从工作描述中提取所有文本非常困难。我的代码似乎运行得还不错,但处理超过100行的数据时,速度慢得令人难以忍受。我需要找到一种方法,让这个过程更高效,这样我就可以包含更多的数据行来进行预测。
import random
import nltk
import pandas
import csv
import numpy as np
# Load only the columns we need: job description (col 2) and salary (col 10).
# nrows caps the sample size while experimenting.
io = pandas.read_csv('Train_rev1.csv', sep=',', usecols=(2,10), nrows=501)
data = [np.array(x) for x in io.values]
random.shuffle(data)

# 60/40 train/test split (shuffled above, so the split is random).
size = int(len(data) * 0.6)
test_set, train_set = data[size:], data[:size]
train_set = np.array(train_set)
test_set = np.array(test_set)

# Threshold from the TRAINING salaries only. The original code thresholded
# the test labels against the test split's own 75th percentile, which both
# leaks test-set information into the test labels and makes the two splits
# answer different questions; one common threshold fixes that.
Sal75 = np.percentile(train_set[:, 1].astype(float), 75)

# Vectorized binary labeling (1 = at/above the training 75th percentile,
# 0 = below) — replaces the per-row Python loops.
train_set[:, 1] = (train_set[:, 1].astype(float) >= Sal75).astype(int)
test_set[:, 1] = (test_set[:, 1].astype(float) >= Sal75).astype(int)

# (description, label) tuples consumed by the NLTK section below.
train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]
from nltk.tokenize import word_tokenize
all_words = set(word.lower() for passage in train_setT for word in word_tokenize(passage[0]))
t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train_setT]
classifier = nltk.NaiveBayesClassifier.train(t)
all_words2 = set(word.lower() for passage in test_setT for word in word_tokenize(passage[0]))
tt = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in test_setT]
print nltk.classify.accuracy(classifier, tt)
classifier.show_most_informative_features(20)
testres = []
predres = []
for i in range(len(tt)):
testres.append(tt[i][1])
for i in range(len(tt)):
z = classifier.classify(tt[i][0])
predres.append(z)
from nltk.metrics import ConfusionMatrix
cm = nltk.ConfusionMatrix(testres, predres)
print(cm)
这个 csv 文件是从 Kaggle 上获取的,使用的是其中的 Train_rev1 数据集。
1 个回答
1
在你把数据分成60%和40%之后,你可以做以下事情。这可能需要一些新的工具,可能不需要用到NLTK。
import random
import nltk
import pandas
import csv
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report
train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]
train_set = np.array([''.join(el[0]) for el in train_setT])
test_set = np.array([''.join(el[0]) for el in test_setT])
y_train = np.array([el[1] for el in train_setT])
y_test = np.array([el[1] for el in test_setT])
vectorizer = TfidfVectorizer(min_df=2,ngram_range=(1, 2), strip_accents='unicode', norm='l2')
X_train = vectorizer.fit_transform(train_set)
X_test = vectorizer.transform(test_set)
nb_classifier = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = nb_classifier.predict(X_test)
print metrics.confusion_matrix(y_test, y_nb_predicted)
print classification_report(y_test, y_nb_predicted)