使用Scrapy Splash和Lua通过鼠标点击动态加载内容

<div id="slider"> <div class="slider-inner"> <div class="item active"> <img src="https://www.example.com/images/1.jpg"> </div> <div class="item"> <img src="https://www.example.com/images/2.jpg"> </div> </div> </div>

# Lua source (held in a Python string) for a Splash script: it clicks the element
# matched by '#showphotos', waits 0.5 s, then returns the number of
# '#slider div.slider-inner' matches together with the page HTML.
# NOTE(review): select_all(...)[0] uses a 0-based index, but Lua tables are
# 1-based, so `btn` is nil and btn:mouse_click() fails.
click_script = """ function main(splash, args) btn = splash:select_all('#showphotos')[0] btn:mouse_click() assert(splash:wait(0.5)) return { num = #splash:select_all('#slider div.slider-inner'), html = splash:html() } end """

import json
import re
import scrapy
import time
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from myresults.items import MyResultItem


class Spider(scrapy.Spider):
    """Spider that renders the results page through Splash and follows each result link.

    The first request runs a Lua script in Splash that keeps scrolling to the
    bottom of the page until the number of "object-adres" elements stops
    changing; every ``a.adreslink`` href found in the rendered HTML is then
    requested and turned into a ``MyResultItem``.
    """

    name = 'myscraper'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/results']

    def start_requests(self):
        """Yield one SplashRequest for the first start URL, rendered by the Lua script."""
        # Lua script: scroll to the bottom until no further objects appear.
        lua_script = """
        function main(splash, args)
            local object_count = 0
            local url = splash.args.url
            splash:go(url)
            splash:wait(0.5)
            local get_object_count = splash:jsfunc([[
                function (){
                    var objects = document.getElementsByClassName("object-adres");
                    return objects.length;
                }
            ]])
            temp_object_count = get_object_count()
            local retry = 3
            while object_count ~= temp_object_count do
                splash:evaljs('window.scrollTo(0, document.body.scrollHeight);')
                splash:wait(0.5)
                object_count = temp_object_count
                temp_object_count = get_object_count()
            end
            return splash:html()
        end
        """
        # Send the first request through Splash's `execute` endpoint with the
        # Lua script; the rendered page is handled by `parse`.
        yield SplashRequest(
            self.start_urls[0],
            self.parse,
            endpoint='execute',
            args={'lua_source': lua_script},
        )

    def parse(self, response):
        """Yield a plain Request for every `a.adreslink` href in the rendered page."""
        # The response is the page produced by the Lua script above.
        object_links = response.css('a.adreslink::attr(href)').getall()
        for link in object_links:
            # Each detail page is handled by `parse_object`.
            yield scrapy.Request(link, self.parse_object)

    def parse_object(self, response):
        """Yield a MyResultItem whose `url` field is the URL of the response."""
        item = MyResultItem()
        item['url'] = response.url
        yield item

1条回答

网友

1楼 · 发布于 2024-04-19 06:15:07

Lua脚本像Python脚本一样运行。在Spider -> start_requests -> lua_script中，您已经有了一个Lua脚本。您希望选择第一个#showphotos元素并单击它；此外，您还希望向返回结果中添加更多数据。

因此，在执行已经存在的Lua代码之后，我们希望告诉Splash选择第一个#showphotos元素：

btn = splash:select_all('#showphotos')[1]

请注意索引是1而不是0，因为Lua的数组（包括splash:select_all返回的数组）从1开始。

之后，单击它：

btn:mouse_click()

最后，在结果中添加更多数据：

return {
    num = splash:select_all('#slider div.slider-inner')[1].node.outerHTML,
    html = splash:html()
}

请再次注意索引是1而不是0，因为splash:select_all数组从1开始。另外，我添加了.node.outerHTML，因为splash:select_all()返回一个Lua对象，并且没有默认的方式将其序列化为JSON（参见Splash文档）。

最后，您应该得到如下结果：

--- Splash entry point: load the page, scroll until no new objects appear,
-- click the '#showphotos' element and return the slider markup and page HTML.
-- @param splash Splash scripting object; the URL is read from splash.args.url.
-- @param args Request arguments (not read directly by this script).
-- @return table with `num` (outerHTML of the first '#slider div.slider-inner'
--   element) and `html` (the result of splash:html()).
function main(splash, args)
  local url = splash.args.url
  splash:go(url)
  splash:wait(0.5)

  local get_object_count = splash:jsfunc([[
    function (){
      var objects = document.getElementsByClassName("object-adres");
      return objects.length;
    }
  ]])

  -- Keep scrolling to the bottom until a scroll no longer changes the number
  -- of "object-adres" elements.
  local object_count = 0
  local temp_object_count = get_object_count()
  while object_count ~= temp_object_count do
    splash:evaljs('window.scrollTo(0, document.body.scrollHeight);')
    splash:wait(0.5)
    object_count = temp_object_count
    temp_object_count = get_object_count()
  end

  -- Lua arrays are 1-based, so the first match is at index 1.
  local btn = splash:select_all('#showphotos')[1]
  assert(btn, "no element matches '#showphotos'")
  btn:mouse_click()
  assert(splash:wait(0.5))

  -- Return the element's outerHTML string rather than the element object itself.
  local slider = splash:select_all('#slider div.slider-inner')[1]
  assert(slider, "no element matches '#slider div.slider-inner'")

  return {
    num = slider.node.outerHTML,
    html = splash:html()
  }
end

相关问题更多 >

编程相关推荐

热门问题

热门文章