正在为大学体育课编写脚本，不断收到错误“AttributeError:模块“scrapy”没有属性“spider”

from selenium import webdriver from bs4 import BeautifulSoup import scrapy from scrapy.spiders import Spider import requests import time import xlsxwriter import pandas as pd url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23' driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver') driver.get(url) class WebSpider(scrapy.spider): name = "Web_Spider" allowed_domains = ['https://www.ufc.com/athletes'] start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2'] def __init__(self): self.driver = driver def parse(self, response): self.driver.get(response.url) while True: next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a') try: next.click() except: break self.driver.close()

3条回答

网友

1楼 · 编辑于 2024-05-14 11:09:15

根据您要做的事情，我不会在这里使用Selenium，因为您可以直接通过ajax获取数据。硒仍能发挥作用，但它的杀伤力有点过大，效率较低

试试这个：

import requests
from bs4 import BeautifulSoup
import re


url = 'https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'}

page = 1
end_of_load = False
while end_of_load == False:
    payload = {
    'view_name': 'all_athletes',
    'view_display_id': 'page',
    'view_path': '/athletes/all',
    'pager_element': '0',
    'gender': 'All',
    'page': '%s' %page}
    
    
    jsonData = requests.post(url, headers=headers, data=payload).json()
    print('Page: %s' %page)
    page += 1
    
    html = jsonData[-1]['data']

    soup = BeautifulSoup(html, 'html.parser')
    
    player_cards = soup.find_all('div',{'class':re.compile('.*view-mode-all-athletes-result.*')})
    if not player_cards:
        end_of_load = True
  
    else:
        for player_card in player_cards:
            name = player_card.find('span',{'class':re.compile('.*athlete__name.*')}).text.strip()
            try:
                weight_class = player_card.find('div',{'class':re.compile('.*weight-class.*')}).text.strip()
            except:
                weight_class = 'N/A'
            try:
                record = player_card.find('span',{'class':re.compile('.*athlete__record.*')}).text.strip()
            except:
                record = 'N/A'
            print('\t%s - %s\t%s' %(name,weight_class,record))

网友

2楼 · 编辑于 2024-05-14 11:09:15

Try now: 

from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd

url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)

class WebSpider(scrapy.Spider):
    name = "Web_Spider"
    allowed_domains = ['https://www.ufc.com/athletes']
    start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']

    def __init__(self):
        self.driver = driver

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')

            try:
                next.click()

            except:
                break

        self.driver.close()

网友

3楼 · 编辑于 2024-05-14 11:09:15

是刮毛的蜘蛛，大写“s”

相关问题更多 >

编程相关推荐

热门问题

热门文章