我的爬虫产生了太多的 item，而不是把它们合并成少量几个 item

1条回答

网友
1楼 · 发布于 2024-06-16 11:38:01

parse_results 中的 XPath 写错了，下面是一个可以获取所需数据的可运行示例：
import scrapy


class PremierleagueItem(scrapy.Item):
    """Scraped results for one club, as collected from one season page.

    Only ``clubName`` and ``matches`` are filled in by the spider below;
    the remaining fields are declared but not assigned by it.
    """

    round = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()
    place = scrapy.Field()
    opponent = scrapy.Field()
    results = scrapy.Field()
    clubName = scrapy.Field()
    matches = scrapy.Field()


class PremierleagueSpider(scrapy.Spider):
    """Crawl worldfootball.net from the league table down to season results.

    The callbacks chain as: ``parse`` -> ``parse_get_historic_results_link``
    -> ``parse_seasons`` -> ``parse_results``. The item created in ``parse``
    travels along the chain in ``request.meta['item']``.
    """

    name = "premierleague"
    allowed_domains = ["worldfootball.net"]
    start_urls = [
        "http://www.worldfootball.net/competition/eng-premier-league/"
    ]

    def parse(self, response):
        """Yield one request per club row found in the league table.

        A row counts as a club row when its third cell contains a link
        text; rows without one are skipped.
        """
        rows = response.xpath(
            '//div[@id="tabelle_0"]/div[@class="data"]/table[1]/tr')
        for row in rows:
            club_name = row.xpath('.//td[3]/a/text()').extract()
            if not club_name:
                continue
            club_href = row.xpath('.//td[2]/a/@href').extract_first()
            if not club_href:
                # urljoin(None) would just give back the current URL and
                # trigger a pointless duplicate request.
                continue
            item = PremierleagueItem()
            item['clubName'] = club_name
            yield scrapy.Request(
                response.urljoin(club_href),
                callback=self.parse_get_historic_results_link,
                meta={'item': item},
            )

    def parse_get_historic_results_link(self, response):
        """Follow the link at ``ul[5]/li[2]/a[1]`` of the ``navibox2`` box.

        NOTE(review): judging by the method name this is the club's
        "historic results" link; the positional XPath is taken as-is.
        """
        href = response.xpath(
            '//div[@class="navibox2"]/div[@class="data"]'
            '/ul[5]/li[2]/a[1]/@href').extract_first()
        if not href:
            return
        yield scrapy.Request(
            response.urljoin(href),
            callback=self.parse_seasons,
            meta={'item': response.meta['item']},
        )

    def parse_seasons(self, response):
        """Yield one request per link in column 2 of the first results table."""
        item = response.meta['item']
        links = response.xpath(
            '(//table[@class="standard_tabelle"])[1]/tr/td[2]/a')
        for link in links:
            href = link.xpath('.//@href').extract_first()
            if not href:
                continue
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_results,
                meta={'item': item},
            )

    @staticmethod
    def parse_results(response):
        """Yield a single item holding every match row of the first table.

        Each row (from the fourth ``tr`` on) is turned into a dict by
        pairing its non-empty, stripped text nodes, in document order, with
        the keys ``round, date, place, opponent, results``.
        """
        # Every season request of a club carries the same item object;
        # work on a copy so pages do not overwrite each other's matches.
        item = response.meta['item'].copy()
        keys = ('round', 'date', 'place', 'opponent', 'results')
        matches = []
        rows = response.xpath(
            '(//table[@class="standard_tabelle"])[1]/tr[position() > 3]')
        for row in rows:
            texts = row.xpath("./td[normalize-space(.)]//text()").extract()
            # Calling .strip() on each string works on Python 2 and 3 alike,
            # unlike the Python-2-only ``unicode.strip``.
            values = [text.strip() for text in texts]
            matches.append(dict(zip(keys, [v for v in values if v])))
        item['matches'] = matches
        yield item
输出的一个片段：
{'clubName': [u'Manchester City'], 'matches': [{'date': u'09/09/1911', 'opponent': u'Liverpool FC', 'place': u'A', 'results': u'2:2', 'round': u'2. Round'}, {'date': u'16/09/1911', 'opponent': u'Aston Villa', 'place': u'H', 'results': u'2:6', 'round': u'3. Round'}, {'date': u'23/09/1911', 'opponent': u'Newcastle United', 'place': u'A', 'results': u'0:1', 'round': u'4. Round'}, {'date': u'30/09/1911', 'opponent': u'Sheffield United', 'place': u'H', 'results': u'0:0', 'round': u'5. Round'}, {'date': u'07/10/1911', 'opponent': u'Oldham Athletic', 'place': u'A', 'results': u'1:4', 'round': u'6. Round'}, {'date': u'14/10/1911', 'opponent': u'Bolton Wanderers', 'place': u'H', 'results': u'3:1', 'round': u'8. Round'}, {'date': u'21/10/1911', 'opponent': u'Bradford City', 'place': u'A', 'results': u'1:4', 'round': u'9. Round'}, {'date': u'28/10/1911', 'opponent': u'Woolwich Arsenal', 'place': u'H', 'results': u'3:3', 'round': u'9. Round'}, {'date': u'04/11/1911', 'opponent': u'Preston North End', 'place': u'A', 'results': u'1:2', 'round': u'10. Round'}, {'date': u'11/11/1911', 'opponent': u'Everton FC', 'place': u'A', 'results': u'0:1', 'round': u'12. Round'}, {'date': u'18/11/1911', 'opponent': u'West Bromwich Albion', 'place': u'H', 'results': u'0:2', 'round': u'12. Round'}, {'date': u'25/11/1911', 'opponent': u'Sunderland AFC', 'place': u'A', 'results': u'1:1', 'round': u'13. Round'}, {'date': u'02/12/1911', 'opponent': u'Blackburn Rovers', 'place': u'H', 'results': u'3:0', 'round': u'15. Round'}, {'date': u'09/12/1911', 'opponent': u'Sheffield Wednesday', 'place': u'A', 'results': u'0:3', 'round': u'15. Round'}, {'date': u'16/12/1911', 'opponent': u'Bury FC', 'place': u'H', 'results': u'2:0', 'round': u'16. Round'}, {'date': u'23/12/1911', 'opponent': u'Middlesbrough FC', 'place': u'A', 'results': u'1:3', 'round': u'17. Round'}, {'date': u'25/12/1911', 'opponent': u'Notts County', 'place': u'A', 'results': u'1:0', 'round': u'18. 
Round'}, {'date': u'26/12/1911', 'opponent': u'Notts County', 'place': u'H', 'results': u'4:0', 'round': u'19. Round'}, {'date': u'30/12/1911', 'opponent': u'Manchester United', 'place': u'A', 'results': u'0:0', 'round': u'20. Round'}, {'date': u'06/01/1912', 'opponent': u'Liverpool FC', 'place': u'H', 'results': u'2:3', 'round': u'21. Round'}, {'date': u'20/01/1912', 'opponent': u'Aston Villa', 'place': u'A', 'results': u'1:3', 'round': u'22. Round'}, {'date': u'27/01/1912', 'opponent': u'Newcastle United', 'place': u'H', 'results': u'1:1', 'round': u'23. Round'}, {'date': u'10/02/1912', 'opponent': u'Oldham Athletic', 'place': u'H', 'results': u'1:3', 'round': u'24. Round'}, {'date': u'17/02/1912', 'opponent': u'Bolton Wanderers', 'place': u'A', 'results': u'1:2', 'round': u'27. Round'}, {'date': u'26/02/1912', 'opponent': u'Sheffield United', 'place': u'A', 'results': u'2:6', 'round': u'26. Round'}, {'date': u'02/03/1912', 'opponent': u'Woolwich Arsenal', 'place': u'A', 'results': u'0:2', 'round': u'28. Round'}, {'date': u'09/03/1912', 'opponent': u'Preston North End', 'place': u'H', 'results': u'0:0', 'round': u'28. Round'}, {'date': u'16/03/1912', 'opponent': u'Everton FC', 'place': u'H', 'results': u'4:0', 'round': u'29. Round'}, {'date': u'23/03/1912', 'opponent': u'West Bromwich Albion', 'place': u'A', 'results': u'1:1', 'round': u'30. Round'}, {'date': u'28/03/1912', 'opponent': u'Bradford City', 'place': u'H', 'results': u'4:0', 'round': u'31. Round'}, {'date': u'30/03/1912', 'opponent': u'Sunderland AFC', 'place': u'H', 'results': u'2:0', 'round': u'32. Round'}, {'date': u'05/04/1912', 'opponent': u'Tottenham Hotspur', 'place': u'H', 'results': u'2:1', 'round': u'33. Round'}, {'date': u'06/04/1912', 'opponent': u'Blackburn Rovers', 'place': u'A', 'results': u'0:2', 'round': u'31. Round'}, {'date': u'08/04/1912', 'opponent': u'Tottenham Hotspur', 'place': u'A', 'results': u'2:0', 'round': u'35. 
Round'}, {'date': u'13/04/1912', 'opponent': u'Sheffield Wednesday', 'place': u'H', 'results': u'4:0', 'round': u'36. Round'}, {'date': u'20/04/1912', 'opponent': u'Bury FC', 'place': u'A', 'results': u'2:1', 'round': u'37. Round'}, {'date': u'27/04/1912', 'opponent': u'Middlesbrough FC', 'place': u'H', 'results': u'2:0', 'round': u'38. Round'}]}
要得到你想要的精确格式，你还需要再做一些工作，但无论怎么做，都必须使用正确的 XPath。另外要注意，数据会一直回溯到 1900 年左右，所以输出量会很大，可能更适合存入数据库。我在这里只从每个页面中取了第一个表格（联赛结果）；有些页面有多个表格，还有一些页面只包含足总杯（F.A. Cup）结果、青年队（原文疑为 youth teams）结果等。如果你想得到所有的数据，可以这样写：
for tbl in response.xpath('(//table[@class="standard_tabelle"])'): for sel in tbl.xpath("./tr[position() > 3]"): matchDict = dict(zip(( 'round', 'date', 'place', 'opponent', 'results'), filter(None, map(unicode.strip, (sel.xpath("./td[normalize-space(.)]//text()").extract()))))) item['matches'].append(matchDict) yield item
在第一个表格的下半部分也有一些杯赛成绩，所以如果你只是想要英超联赛：
@staticmethod
def parse_results(response):
    """Yield one item with the Premier League rows of the first table.

    Rows are read from the fourth ``tr`` of the first ``standard_tabelle``
    table. Collection stops at the first row whose link ``title`` does not
    contain "premier" (case-insensitive); the matches gathered up to that
    point are still yielded.

    Each kept row becomes a dict pairing its non-empty, stripped text
    nodes, in document order, with the keys
    ``round, date, place, opponent, results``.
    """
    # The item in meta may be shared by several requests; copy it so this
    # page's matches do not overwrite another page's.
    item = response.meta['item'].copy()
    item['matches'] = []
    keys = ('round', 'date', 'place', 'opponent', 'results')
    table = response.xpath('(//table[@class="standard_tabelle"])[1]')
    for sel in table.xpath("./tr[position() > 3]"):
        title = sel.xpath("./td/a/@title").extract_first()
        if title and "premier" not in title.lower():
            # ``break`` rather than ``return``: returning here would end
            # the generator before the yield and lose the collected rows.
            break
        texts = sel.xpath("./td[normalize-space(.)]//text()").extract()
        # .strip() per string works on Python 2 and 3 alike, unlike the
        # Python-2-only ``unicode.strip``.
        values = [text.strip() for text in texts]
        item['matches'].append(dict(zip(keys, [v for v in values if v])))
    yield item

相关问题更多 >

编程相关推荐

热门问题

热门文章