Splash for Scrapy仅返回空列表

import scrapy
from ..items import TestItem
from scrapy_splash import SplashRequest


class HoteldataSpider(scrapy.Spider):
    """Scrape hotel listings from a TripAdvisor results page rendered by Splash."""

    name = "Testdata"
    start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        """Issue every start URL through Splash, waiting 5 s before the page is returned."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 5})

    def parse(self, response):
        """Yield one ``TestItem`` per ``div.listItem`` entry found in the response.

        Entries without a title text, link or id attribute are skipped instead
        of aborting the whole page.
        """
        for entry in response.css("div.listItem"):
            # .get() returns the first match (or None) directly, so there is no
            # need to round-trip through str(list) and split on quote
            # characters, which broke on names containing an apostrophe.
            hotel_name = entry.css(".listing_title [target=_blank]::text").get()
            hotel_link = entry.css(".listing_title a").xpath("@href").get()
            hotel_id = entry.css(".listing_title").css("a::attr(id)").get()
            hotel_displayed_price = (
                entry.css(".premium_offer_container").css("div::attr(data-locationid)").extract()
            )

            if hotel_name is None or hotel_link is None or hotel_id is None:
                continue

            # Build a fresh item per entry; reusing a single instance lets later
            # iterations overwrite fields of items that were already yielded.
            item = TestItem()
            item["hotel_names"] = hotel_name
            item["hotel_links"] = "https://www.tripadvisor.com" + hotel_link
            # The numeric id is the part after the first underscore of the id attribute.
            item["hotel_ids"] = int(hotel_id.split("_")[1])
            item["hotel_displayed_price"] = hotel_displayed_price
            yield item

3条回答

网友

1楼 · 编辑于 2024-04-19 03:18:27

在这一行中

hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid").extract()

"div::attr(data-locationid" 是否缺少右括号？

网友

2楼 · 编辑于 2024-04-19 03:18:27

对于每个有类似问题的人：这里是我的解决方案。但是，在运行脚本时，我确实遇到了重复的问题

import scrapy
from ..items import HotelinfoItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    """Scrape hotel listings page by page from TripAdvisor via a local Splash instance."""

    name = "Hoteldata"
    # NOTE(review): these URLs already target Splash's render.html endpoint and
    # are additionally wrapped in SplashRequest below; confirm whether going
    # through Splash twice is intended.
    start_urls = ["http://localhost:8050/render.html?url=https:"
                  "//www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        """Issue every start URL as a SplashRequest with a 10 s wait."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})

    def parse(self, response):
        """Yield one ``HotelinfoItem`` per ``div.listItem`` entry, then follow the next page.

        Every field value is a list, as before. Entries without a title text,
        link or id attribute are skipped instead of aborting the whole page.
        """
        for entry in response.css("div.listItem"):
            # .get() returns the first match (or None) directly, so there is no
            # need to round-trip through str(list) and split on quote
            # characters, which broke on names containing an apostrophe.
            hotel_name = entry.css(".listing_title [target=_blank]::text").get()
            hotel_link = entry.css(".listing_title a").xpath("@href").get()
            hotel_id = entry.css(".listing_title").css("a::attr(id)").get()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-pernight)").extract()
            hotel_type = entry.css(".mb10").css(".label::text").extract()

            if hotel_name is None or hotel_link is None or hotel_id is None:
                continue

            # Build a fresh item per entry; reusing a single instance lets later
            # iterations overwrite fields of items that were already yielded.
            item = HotelinfoItem()
            item["hotel_names"] = [hotel_name]
            item["hotel_links"] = ["https://www.tripadvisor.com" + hotel_link]
            # The id is the part after the first underscore of the id attribute.
            item["hotel_ids"] = [hotel_id.split("_")[1]]
            # Fall back to placeholder values when the listing shows no label / price.
            item["hotel_type"] = hotel_type or ["Hotel"]
            item["hotel_displayed_price"] = hotel_displayed_price or ["NA"]

            yield item

        next_page = response.css("a.next::attr(href)").get()
        if next_page is not None:
            # Build the Splash URL only when a next link exists; drop the
            # "#fragment" part of the href before appending the Splash arguments.
            next_page_splash = (
                "http://localhost:8050/render.html?url=https://www.tripadvisor.com"
                + next_page.split("#")[0]
                + "&timeout=10&wait=5"
            )
            yield response.follow(next_page_splash, callback=self.parse)

网友

3楼 · 编辑于 2024-04-19 03:18:27

我已经查看了scrapy下的行为，价格不会在HTML中返回给scrapy的请求。您在浏览器中看到的内容（甚至是Splash）与您的代码看到的内容不一样

我对 scrapy 不太了解，无法解决这个问题，但似乎只用普通的 requests 和 BeautifulSoup 就可以获得您所需要的内容：

import requests
from bs4 import BeautifulSoup

# Fetch the listing page with a plain HTTP request (no JavaScript rendering).
r = requests.get('https://www.tripadvisor.ie/Hotels-g189541-Copenhagen_Zealand-Hotels.html')
# Parse the body of the response object; the requests module itself has no
# `content` attribute.
soup = BeautifulSoup(r.content, 'lxml')
# Collect the text of every element matching the price selector.
prices = [price.text for price in soup.select('.price-wrap .price')]

print(prices)
['€131', '€112', '€121', '€133', '€172', '€169', '€74', '€189', ...]

相关问题更多 >

编程相关推荐

热门问题

热门文章