Python/Scrapy: handling missing table data, list index out of range

Published 2024-04-26 18:13:35


I use round_col as the list length because that column always has data, which tells me the size of the table. I'm scraping the data to CSV, so all the fields must line up, but when the loop hits missing data I get "list index out of range" or "TypeError: 'NoneType' object is not subscriptable".

from scrapy.selector import Selector
from scrapy.spiders import Spider
from bigcrawler.items import BigcrawlerItem

class CrawlbotSpider(Spider):
    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']
    start_urls = ['https://matchstat.com/tennis/all-upcoming-matches']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'test.csv'
    }

    def parse(self, response):
        hxs = Selector(response)
        item = BigcrawlerItem()

        round_col = hxs.xpath(".//tr[contains(@class, 'match')]/td[contains(@class, 'round')]/text()").extract()
        event_col = hxs.xpath(".//tr[contains(@class, 'match')]/td[contains(@class, 'event-name')]/a/text()").extract()
        player1_col = hxs.xpath(".//tr[contains(@class, 'match')]/td[contains(@class, 'player-name')][1]/a/text()").extract()
        player2_col = hxs.xpath(".//tr[contains(@class, 'match')]/td[contains(@class, 'player-name')][2]/a/text()").extract()
        odds1_col = hxs.xpath(".//tr[contains(@class, 'match highlight')]/td[contains(@class, 'odds-td odds-0')]/a[contains(@class, 'btn btn-default virtual btn-xs btn-outcome odds')][1]/text()").extract_first()
        odds2_col = hxs.xpath(".//tr[contains(@class, 'match highlight')]/td[contains(@class, 'odds-td odds-1')]/a[contains(@class, 'btn btn-default virtual btn-xs btn-outcome odds')][2]/text()").extract_first()
        h2h_col = hxs.xpath(".//tr[contains(@class, 'match')]/td[contains(@class, 'h2h')]/a[contains(@class, 'h2h')]/text()").extract_first()

        for x in range(0, len(round_col)):
            item['round'] = round_col[x].strip()
            item['event1'] = event_col[x].strip()
            item['player_1'] = player1_col[x].strip()
            item['player_2'] = player2_col[x].strip()
            item['player_1_odds'] = odds1_col[x].strip()
            item['player_2_odds'] = odds2_col[x].strip()
            item['h_2_h'] = h2h_col[x].strip()
            yield item

If the lists end up with different lengths the data is useless. How can I modify the loop to insert "None" when it doesn't hit an element, while keeping the loop running over len(round_col)?

Or is there another approach I could take? Thanks.


3 Answers

It works better if you extract the data this way (replace the body of your parse method with this code and adapt it as needed):

data = []
rows = response.xpath('//tr[contains(@class, "match ")]')
for i in rows:
    # extract_first() returns None when a node is missing,
    # so every row keeps the same set of fields
    round_col = i.css("td.round::text").extract_first()
    event_col = i.css("td.event-name::text").extract_first()
    players = i.css("td.player-name")
    player1_col = players[0].css("a::text").extract_first()
    player2_col = players[1].css("a::text").extract_first()  # second player cell
    odds1_col = i.css("td.odds-td.odds-0 a.btn.btn-default.virtual.btn-xs.btn-outcome.odds::text").extract_first()
    odds2_col = i.css("td.odds-td.odds-1 a.btn.btn-default.virtual.btn-xs.btn-outcome.odds::text").extract_first()
    h2h_col = i.css("td.h2h a.h2h::text").extract_first()
    data.append({'round': round_col, 'event': event_col, 'player1': player1_col,
                 'player2': player2_col, 'odds1': odds1_col, 'odds2': odds2_col,
                 'h2h': h2h_col})

print(data)

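Because extract_first() gives back None for a missing node, each row dict always carries every key, and the csv module writes None values as empty cells, so the columns stay aligned. A minimal stdlib sketch with hypothetical row data:

```python
import csv
import io

# Hypothetical rows mimicking what the loop above collects;
# the second match is missing its odds, so those values are None.
data = [
    {'round': 'R1', 'player1': 'A', 'player2': 'B', 'odds1': '1.50', 'odds2': '2.60'},
    {'round': 'R1', 'player1': 'C', 'player2': 'D', 'odds1': None, 'odds2': None},
]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['round', 'player1', 'player2', 'odds1', 'odds2'])
writer.writeheader()
writer.writerows(data)  # None is written as an empty field
print(buf.getvalue())
```

The second data row comes out as `R1,C,D,,` — the empty odds cells keep their position instead of shifting the columns, which is exactly what the index-based loop in the question could not guarantee.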
Rather than trying to build the output column by column, you should iterate over the rows and build one item per row. I've used an item loader here so you can avoid a pile of .extract_first() calls; try the following:

# -*- coding: utf-8 -*-
import scrapy

# This should go to items.py
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller


class MatchStatItem(scrapy.Item):
    round = scrapy.Field()
    event1 = scrapy.Field()
    player_1 = scrapy.Field()
    player_2 = scrapy.Field()
    player_1_odds = scrapy.Field()
    player_2_odds = scrapy.Field()
    h_2_h = scrapy.Field()


class MatchStatItemLoader(ItemLoader):
    default_item_class = MatchStatItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()

class MatchStatSpider(scrapy.Spider):
    name = "matchstat"
    allowed_domains = ["matchstat.com"]
    start_urls = ['https://matchstat.com/tennis/all-upcoming-matches']

    def parse(self, response):
        for row in response.css('tr.match'):
            il = MatchStatItemLoader(selector=row)
            il.add_css('round', '.round::text')
            il.add_css('event1', '.event-name a::text')
            il.add_css('player_1', '.player-name:nth-child(3) a::text')
            il.add_css('player_2', '.player-name:nth-child(4) a::text')
            il.add_css('player_1_odds', '.odds-td.odds-0 [payout]::text')
            il.add_css('player_2_odds', '.odds-td.odds-1 [payout]::text')
            il.add_css('h_2_h', 'a.h2h::text')
            yield il.load_item()

The output will be one populated item per match row.
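The default_input_processor above relies on operator.methodcaller('strip'), which turns a method name into a callable; MapCompose then applies that callable to every extracted string. The stdlib half can be seen in isolation, in a sketch that doesn't require Scrapy installed:

```python
from operator import methodcaller

strip = methodcaller('strip')  # calls .strip() on whatever it is given

# Raw text nodes as a selector might extract them
values = ['  Roger Federer \n', ' 1.50 ']

# This per-element application is what MapCompose does internally
cleaned = [strip(v) for v in values]
print(cleaned)  # → ['Roger Federer', '1.50']
```

Combined with TakeFirst() as the output processor, each field ends up as a single stripped string rather than a list.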

I suggest you use CSS selectors like this:

for match in response.css("table.draw-table.filter-table tr.match"):
    item['round'] = "".join(a.strip() for a in match.css("td.round *::text").extract())
    # do the other TDs just like the above
    yield item

That way you won't get any errors; if a TD has no data, it will simply show up as an empty field in your output CSV.
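The join-over-all-text-nodes pattern can never raise on missing data, because joining an empty list of text nodes just yields an empty string. A quick standalone sketch with made-up text nodes standing in for what .extract() returns:

```python
# Text nodes as extract() would return them for two table cells:
full_cell = ['\n  R1 ', ' Qualifier\n']  # cell containing data
empty_cell = []                          # cell with no matching text nodes

# Joining strips each fragment; an empty list produces ''
print("".join(a.strip() for a in full_cell))   # → 'R1Qualifier'
print("".join(a.strip() for a in empty_cell))  # → ''
```

Note the fragments are concatenated without separators; if the cell holds several text nodes you may want `" ".join(...)` instead to keep them readable.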
