Scrapy 只爬取了第 1 页

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item


class ProjectSpider(BaseSpider):
    """Spider that extracts business listings from a directory search page.

    Every ``<div class="abTbl ">`` node found in the response is turned into
    one ``Project2Item``.

    NOTE: ``parse`` returns items only and never issues follow-up requests,
    so the crawl is limited to the pages listed in ``start_urls`` (i.e. the
    first results page).
    """

    name = "project2spider"
    # allowed_domains expects bare host names, not URLs; a URL entry does not
    # match any request host when off-site filtering is applied.
    allowed_domains = ["directory.thesun.co.uk"]
    start_urls = [
        "http://directory.thesun.co.uk/find/uk/computer-repair"
    ]

    # (item field, XPath relative to one listing node), in assignment order.
    _FIELD_XPATHS = (
        ('Catogory', 'span[@class="icListBusType"]/text()'),
        ('Bussiness_name', 'a/@title'),
        ('Description', 'span[last()]/text()'),
        ('Number', 'span[@class="searchInfoLabel"]/span/@id'),
        ('Web_url', 'span[@class="searchInfoLabel"]/a/@href'),
        ('adress_name', 'span[@class="searchInfoLabel"]/span/text()'),
        ('Photo_name', 'img/@alt'),
        ('Photo_path', 'img/@src'),
    )

    def parse(self, response):
        """Build one item per listing node in ``response``.

        Args:
            response: the downloaded page handed over by Scrapy.

        Returns:
            A list of ``Project2Item`` objects; each field holds the list of
            strings produced by ``extract()`` for that field's XPath.
        """
        hxs = HtmlXPathSelector(response)
        # The class value is matched exactly, including its trailing space.
        sites = hxs.select('//div[@class="abTbl "]')

        items = []
        for site in sites:
            item = Project2Item()
            for field_name, xpath in self._FIELD_XPATHS:
                item[field_name] = site.select(xpath).extract()
            items.append(item)
        return items

from scrapy.item import Item, Field


class Project2Item(Item):
    """Container for the values scraped from one directory listing.

    Field names are kept exactly as spelled here because they are the keys
    used when the item is populated.
    """

    # Text of the listing's business-type label.
    Catogory = Field()
    # Title attribute of the listing's link.
    Bussiness_name = Field()
    # Text of the last span in the listing.
    Description = Field()
    # id attribute of the span inside the info label.
    Number = Field()
    # href of the link inside the info label.
    Web_url = Field()
    # Text of the span inside the info label.
    adress_name = Field()
    # alt attribute of the listing image.
    Photo_name = Field()
    # src attribute of the listing image.
    Photo_path = Field()

1条回答

网友

1楼 · 发布于 2024-04-25 14:07:50

对于描述字段：使用 .select('span/text()') 时，你选中的是 //div[@class="abTbl "] 下所有 span 元素的文本。如果只想提取最后一个 span，可以使用 XPath 表达式 'span[last()]/text()'。

顺便说一句，这个教程 http://www.w3schools.com/xpath/xpath_syntax.asp 应该能帮助你学习使用 XPath。

相关问题更多 >

编程相关推荐

热门问题

热门文章