Empty list as output when scraping HTML elements with Scrapy

Posted on 2024-04-25 14:03:32


I am trying to get data from this website, but all I get from Scrapy are empty lists. I used SelectorGadget to find the class names of the elements. I also checked the site's robots.txt file, and the link I am accessing is disallowed there.

I then set a User-Agent to get around that restriction, but I still don't understand why I get empty lists when extracting the elements.

Here is my spider class:

import scrapy


class FlightDestinationSpider(scrapy.Spider):
    name = 'flights_destinations'
    start_urls = [
        "https://xxxxxxxxxxxx"
    ]

    def parse(self, response):
        # flights = HuxyscraperItem()

        # div_flight_table = response.css('div.detail-table__row')
        # Extract the text of every element carrying these classes; both calls
        # return an empty list if the classes are not in the downloaded HTML.
        time1 = response.css('.fdabf-td2').css('::text').extract()
        flight = response.css('.fdabf-td3::text').extract()

        yield {'time': time1, 'flug': flight}
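
One way to narrow this down is to log what the spider actually downloaded before applying any selectors. This is only a debugging sketch, not part of the original spider; self.logger is Scrapy's built-in per-spider logger:

    def parse(self, response):
        # Confirm the request succeeded and peek at the raw HTML. If the
        # fdabf-* classes do not appear in this snippet, the data is loaded
        # some other way (for example via a separate JSON request).
        self.logger.info("status=%s", response.status)
        self.logger.info("body snippet: %s", response.text[:500])

        time1 = response.css('.fdabf-td2').css('::text').extract()
        flight = response.css('.fdabf-td3::text').extract()
        yield {'time': time1, 'flug': flight}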

Then, in my settings.py, I have the following:

# -*- coding: utf-8 -*-

# Scrapy settings for huxyscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'huxyscraper'

SPIDER_MODULES = ['huxyscraper.spiders']
# NEWSPIDER_MODULE = 'huxyscraper.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

PROXY_POOL_ENABLED = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'huxyscraper.middlewares.HuxyscraperSpiderMiddleware': 543,
# }

# DOWNLOADER_MIDDLEWARES = {
#     # ...
#     'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
#     'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
#     # ...
# }
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'huxyscraper.middlewares.HuxyscraperDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'huxyscraper.pipelines.HuxyscraperPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
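
Two notes on this configuration: the RandomUserAgentMiddleware entry assumes the scrapy-user-agents package is installed, and the same bypass can also be scoped to a single spider instead of being applied globally. The following is only a minimal sketch using Scrapy's standard custom_settings attribute; the User-Agent string is just an example:

import scrapy


class FlightDestinationSpider(scrapy.Spider):
    name = 'flights_destinations'
    # Per-spider overrides; these take precedence over the values in settings.py.
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }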

So, when I run the application, this is the last dictionary I get:

[Screenshot of the console output: the yielded dictionary contains empty lists]

So, what am I doing wrong?


1 Answer

Posted on 2024-04-25 14:03:32

You should crawl this instead of this: the page loads its data from that one, so you get empty lists because the data you want simply is not present in that page's HTML.

import scrapy
import json


class FlightDestinationSpider(scrapy.Spider):
    name = 'flights_destinations'
    # The start URL (redacted) points at the JSON endpoint the page loads its
    # data from, not at the HTML page itself.
    start_urls = [
        "https://xxxxxxxxx.com"
    ]

    def parse(self, response):
        # The endpoint returns JSON, so parse the body instead of using CSS selectors.
        json_data = json.loads(response.body)
        for data in json_data['monitor']['departure']:
            time1 = data['scheduledatetime']
            flight = data['airline']
            yield {'time': time1, 'flug': flight}
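
On Scrapy 2.2 or newer, the response object can parse the JSON body itself, so the json import is not strictly needed; a small variation on the parse method above:

    def parse(self, response):
        # TextResponse.json() parses the JSON body for us (Scrapy >= 2.2).
        for data in response.json()['monitor']['departure']:
            yield {'time': data['scheduledatetime'], 'flug': data['airline']}

Either version can then be run and its items written to a file with Scrapy's standard feed export, e.g. scrapy crawl flights_destinations -o flights.json.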

Before building your spider, it is a good idea to look at the site with scrapy shell and view(response), so you know what you actually get back from the request.
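
For example, with the (redacted) URL from the question, a quick check could look like this; getall() is the newer name for extract():

# In a terminal:
#   scrapy shell "https://xxxxxxxxxxxx"
# Inside the shell:
view(response)                                      # opens what Scrapy downloaded in a browser
response.css('.fdabf-td2').css('::text').getall()   # an empty list here means the data
                                                    # is not in this response's HTML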
