Scrapy with Selenium: crawl only two pages
I want to scrape a website that has more than 10 pages. Each page lists 10 links; the spider collects those links in parse(), then follows each one to scrape further data in parse_detail(). How do I write this so it crawls only two pages instead of all of them? Thanks!
Here is my code; it crawls one page and then the spider stops.
from selenium import webdriver
from scrapy import signals
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher
from myproject.items import CItem  # your project's item class

def __init__(self):
    self.driver = webdriver.Firefox()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

def parse(self, response):
    self.driver.implicitly_wait(20)
    self.driver.get(response.url)
    sites = self.driver.find_elements_by_css_selector("")
    for site in sites:
        item = CItem()
        linkiwant = site.find_element_by_css_selector(" ")
        start = site.find_element_by_css_selector(" ")
        item['link'] = linkiwant.get_attribute("href")
        item['start_date'] = start.text
        yield Request(url=item['link'], meta={'item': item}, callback=self.parse_detail)
    # how to write to only catch 2 pages??
    i = 0
    if i < 2:
        try:
            next = self.driver.find_element_by_xpath("/li[@class='p_next'][1]")
            next_page = next.text
            if next_page == "next_page":
                next.click()
                self.driver.refresh()
                yield Request(self.driver.current_url, callback=self.parse)
                i += 1
        except:
            print "page not found"

def parse_detail(self, response):
    item = response.meta['item']
    self.driver.implicitly_wait(20)
    self.driver.get(response.url)
    sel = Selector(response)
    sites = sel.css("")
    for site in sites:
        item['title'] = site.css(" ").extract()[0]
        item['titleURL'] = site.css(" ").extract()[0]
        # ...
        yield item

def spider_closed(self, spider):
    self.driver.close()
1 Answer
Make i persistent. In your parse(), i is a local variable that is reset to 0 on every call, so the counter never carries over from one request to the next. Store it on the spider instance instead:
def __init__(self):
    self.page_num = 0  # persistent page counter, shared across all parse() calls
    self.driver = webdriver.Firefox()
    dispatcher.connect(self.spider_closed, signals.spider_closed)

# ...then, at the end of parse(), use the instance counter instead of the local i:
if self.page_num < 2:
    try:
        next = self.driver.find_element_by_xpath("/li[@class='p_next'][1]")
        next_page = next.text
        if next_page == "next_page":
            next.click()
            self.driver.refresh()
            yield Request(self.driver.current_url, callback=self.parse)
            self.page_num += 1
    except:
        print "page not found"
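For reference, here is a minimal, self-contained sketch of the same persistent-counter pattern in a plain Scrapy spider, without Selenium. The start URL, the CSS selectors, and the item field are placeholders for illustration (your post left the real selectors blank), not the actual site's markup:

import scrapy

class TwoPageSpider(scrapy.Spider):
    name = "two_pages"
    start_urls = ["http://example.com/list"]  # placeholder listing URL

    def __init__(self, *args, **kwargs):
        super(TwoPageSpider, self).__init__(*args, **kwargs)
        self.page_num = 0  # instance attribute, so it survives across parse() calls

    def parse(self, response):
        self.page_num += 1  # count the listing page being parsed now

        # follow every item link on this listing page (placeholder selector)
        for href in response.css("a.item::attr(href)").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

        # follow the "next" link only while fewer than two listing pages have been seen
        if self.page_num < 2:
            next_href = response.css("li.p_next a::attr(href)").extract_first()
            if next_href:
                yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_detail(self, response):
        yield {"title": response.css("h1::text").extract_first()}  # placeholder field

Separately, one thing worth checking in your original parse(): after clicking "next" you yield Request(self.driver.current_url, ...). If that URL does not change between pages, Scrapy's duplicate filter will drop the repeated request, which by itself can stop the crawl after one page; passing dont_filter=True to that Request bypasses the filter.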