刮擦不跟随链接

import urlparse from scrapy.spider import BaseSpider from scrapy.selector import Selector from scrapy.http import Request import w3lib.url from yelp.items import YelpItem class YelpSpider(BaseSpider): name = "yelp" download_delay = 10 concurrent_requests = 1 concurrent_requests_per_domain = 1 allowed_domains = ["yelp.com"] start_urls = ["http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=0", "http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=20", "http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=30"] def parse(self, response): selector = Selector(response) for title in selector.css("span.indexed-biz-name"): page_url = urlparse.urljoin(response.url, title.xpath("a/@href").extract()[0]) self.log("page URL: %s" % page_url) #continue yield Request(page_url, callback=self.parse_page) for next_page in selector.css(u'ul > li > a.prev-next:contains(\u2192)'): next_url = urlparse.urljoin(response.url, next_page.xpath('@href').extract()[0]) self.log("next URL: %s" % next_url) #continue yield Request(next_url, callback=self.parse) def parse_page(self, response): selector = Selector(response) item = YelpItem() item["name"] = selector.xpath('.//h1[@itemprop="name"]/text()').extract()[0].strip() item["addresslocality"] = u"\n".join( selector.xpath('.//address[@itemprop="address"]//text()').extract()).strip() item["link"] = response.url website = selector.css('div.biz-website a') if website: website_url = website.xpath('@href').extract()[0] item["website"] = w3lib.url.url_query_parameter(website_url, "url") return item

1条回答

网友

1楼 · 发布于 2024-06-07 14:38:32

下一个URL提取和选择逻辑不正确。以具有next和pagination-links_anchor类的link元素为目标。以下是我的工作：

next_url = response.css('a.pagination-links_anchor.next::attr(href)').extract_first()
if next_url:
    next_url = urlparse.urljoin(response.url, next_url)
    self.log("next URL: %s" % next_url)
    yield Request(next_url, callback=self.parse)

相关问题更多 >

编程相关推荐

热门问题

热门文章