Python文本处理:AttributeError:'list'对象没有'lower'属性
我刚接触Python,也刚开始在Stackoverflow上发帖(请多多包涵),现在我想学习如何进行情感分析。我使用的是在一个教程和这里找到的一些代码:Python - AttributeError: 'list' object has no attribute 但是,我总是遇到问题。
Traceback (most recent call last):
File "C:/Python27/training", line 111, in <module>
processedTestTweet = processTweet(row)
File "C:/Python27/training", line 19, in processTweet
tweet = tweet.lower()
AttributeError: 'list' object has no attribute 'lower'`
这是我的代码:
import csv
#import regex
import re
import pprint
import nltk.classify
#start replaceTwoOrMore
def replaceTwoOrMore(s):
#look for 2 or more repetitions of character
pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
return pattern.sub(r"\1\1", s)
# process the tweets
def processTweet(tweet):
#Convert to lower case
tweet = tweet.lower()
#Convert www.* or https?://* to URL
tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))','URL',tweet)
#Convert @username to AT_USER
tweet = re.sub('@[^\s]+','AT_USER',tweet)
#Remove additional white spaces
tweet = re.sub('[\s]+', ' ', tweet)
#Replace #word with word
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
#trim
tweet = tweet.strip('\'"')
return tweet
#start getStopWordList
def getStopWordList(stopWordListFileName):
#read the stopwords file and build a list
stopWords = []
stopWords.append('AT_USER')
stopWords.append('URL')
fp = open(stopWordListFileName, 'r')
line = fp.readline()
while line:
word = line.strip()
stopWords.append(word)
line = fp.readline()
fp.close()
return stopWords
def getFeatureVector(tweet, stopWords):
featureVector = []
words = tweet.split()
for w in words:
#replace two or more with two occurrences
w = replaceTwoOrMore(w)
#strip punctuation
w = w.strip('\'"?,.')
#check if it consists of only words
val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
#ignore if it is a stopWord
if(w in stopWords or val is None):
continue
else:
featureVector.append(w.lower())
return featureVector
def extract_features(tweet):
tweet_words = set(tweet)
features = {}
for word in featureList:
features['contains(%s)' % word] = (word in tweet_words)
return features
#Read the tweets one by one and process it
inpTweets = csv.reader(open('C:/GsTraining.csv', 'rb'),
delimiter=',',
quotechar='|')
stopWords = getStopWordList('C:/stop.txt')
count = 0;
featureList = []
tweets = []
for row in inpTweets:
sentiment = row[0]
tweet = row[1]
processedTweet = processTweet(tweet)
featureVector = getFeatureVector(processedTweet, stopWords)
featureList.extend(featureVector)
tweets.append((featureVector, sentiment))
# Remove featureList duplicates
featureList = list(set(featureList))
# Generate the training set
training_set = nltk.classify.util.apply_features(extract_features, tweets)
# Train the Naive Bayes classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
# Test the classifier
with open('C:/CleanedNewGSMain.txt', 'r') as csvinput:
with open('GSnewmain.csv', 'w') as csvoutput:
writer = csv.writer(csvoutput, lineterminator='\n')
reader = csv.reader(csvinput)
all=[]
row = next(reader)
for row in reader:
processedTestTweet = processTweet(row)
sentiment = NBClassifier.classify(
extract_features(getFeatureVector(processedTestTweet, stopWords)))
row.append(sentiment)
processTweet(row[1])
writer.writerows(all)
任何帮助都会非常感谢。
1 个回答
11
从csv读取的数据结果是一个列表,而lower
这个函数只能作用于字符串。假设这个列表里的确是字符串类型,那么你有两个选择。要么对列表中的每个元素都调用一次lower
,要么把整个列表变成一个字符串,然后再调用lower
。
# the first approach
[item.lower() for item in tweet]
# the second approach
' '.join(tweet).lower()
不过更合理的推测是(没有更多信息很难判断)你其实只想从列表中取出一个元素。大概可以这样做:
for row in reader:
processedTestTweet = processTweet(row[0]) # Again, can't know if this is actually correct without seeing the file
另外,我猜测你可能没有像你想的那样使用csv读取器,因为现在你每次都是在用一个例子来训练朴素贝叶斯分类器,然后让它预测这个它刚训练过的例子。也许你可以解释一下你想要做什么?