Python:索引超出范围E

import re from re import sub import time import cookielib from cookielib import CookieJar import urllib2 from urllib2 import urlopen import difflib import sys cj = CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) opener.addheaders = [('User-agent', 'Mozilla/5.0')] keyWord = ["Scotch"] def main(): i=0 while i<len(keyWord): startingLink = 'https://twitter.com/search/realtime?q='+keyWord[i] tUrl = startingLink+'&src=hash' oldTwit = [] newTwit = [] howSimAr = [.5,.5,.5,.5,.5] sourceCode = opener.open(tUrl).read() splitSource = re.findall(r'<p class="js-tweet-text tweet-text">(.*?)</p>',sourceCode) timeSource = re.findall(r'js-nav" title="(.*?)"',sourceCode) if ( len(splitSource) > 20 ): max_range = 19 else: max_range = len(splitSource) print '' print '' print '' ##print 'Keyword: ' + keyWord[i] print '' for x in range (0, max_range): aTweet = re.sub(r'<.*?>','',splitSource[x]) print (aTweet + '~' + timeSource[x] + '~' + keyWord[i]) #print ';' newTwit.append(aTweet) ## comparison = difflib.SequenceMatcher(None, newTwit, oldTwit) ## howSim = comparison.ratio() ## print ';' ## print 'This selection is',howSim,'similar to the past' ## howSimAr.append(howSim) ## howSimAr.remove(howSimAr[0]) ## ## waitMultiplier = reduce(lambda x, y: x+y, howSimAr)/len(howSimAr) ## ## print '' ## print 'The current similarity array:',howSimAr ## print 'Our current Multiplier:', waitMultiplier oldTwit = [None] for eachItem in newTwit: oldTwit.append(eachItem) newTwit = [None] time.sleep(2) x = 0 i = i + 1 ## except Exception, e: ## print str(e) ## print 'errored in the main try' main()

1条回答

网友

1楼 · 发布于 2024-04-20 08:35:59

twitter搜索页面的源代码中没有出现js-nav" title="，因此第二个regexp将找不到任何内容。事实上

print "len(timeSource) =", len(timeSource)
print "max_range =", max_range

在

for x in range (0, max_range):

将显示：

len(timeSource) = 0
max_range = 20

无论您想归档什么，最好使用HTMLParser左右的文档来处理HTML，而不是使用re。这将更容易确保timeSource[x]和splitSource[x]在所有x中都属于彼此。你知道吗

相关问题更多 >

编程相关推荐

热门问题

热门文章