Twitter Python 爬虫爬取机制问题
下面是我为我的推特爬虫机制写的一小段代码:
from BeautifulSoup import BeautifulSoup
import re
import urllib2
url = 'http://mobile.twitter.com/NYTimesKrugman'
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
print tag.renderContents()
print ('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib2.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'stamp' + str(stamp)
print 'tweets' +str (tweets)
while (stamp is False) and (tweets is True):
b = getnewlink(soup)
print b
red = urllib2.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
问题是,当我的推特爬虫抓取到大约3个月的推文后,我希望它停止继续查看用户的下一页。然而,它似乎并没有这样做。它似乎一直在寻找下一页的推文。我认为这可能是因为checkstamp这个检查总是返回False。有没有人能给我一些建议,告诉我如何修改代码,让爬虫在还有更多推文(通过are_more_tweets机制确认)并且还没有达到3个月的推文时,继续寻找下一页的推文呢?谢谢!
编辑 - 请看下面:
from BeautifulSoup import BeautifulSoup
import re
import urllib
url = 'http://mobile.twitter.com/cleversallie'
output = open(r'C:\Python28\testrecursion.txt', 'a')
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
print(b)
print('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if not (test_stamp[0]) in '0123456789':
continue
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
while (not stamp) and (tweets):
b = getnewlink(soup)
print b
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
1 个回答
1
你的 soup.findall()
方法正在找到一个链接中的图片标签,这个链接符合你的条件(有 href
属性和 class
是 status-link
)。
不要总是对第一个链接直接 return
,可以试试这样:
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
print test_stamp
if not test_stamp[0] in '0123456789':
continue
if test_stamp == '3 months ago':
return True
else:
return False
这样的话,如果链接不是以数字开头,就会跳过它,这样你就可能找到正确的链接。记得保留那个 print
语句,这样你可以看到是否遇到了其他以数字开头的链接,这些链接你也需要过滤掉。
补充说明:你之前的做法是总是对 第一个 times
中的项目 返回。我把它改成了忽略任何不以数字开头的链接。
不过,这样做会导致如果没有找到任何以数字开头的链接时返回 None
。这样是可以的,除了你把 while not stamp and tweets
改成了 while stamp is False and tweets is True
。把它改回 while not stamp and tweets
,这样就能正确地把 None
和 False
视为相同,应该就能正常工作了。