正在为大学体育课编写脚本,不断收到错误“AttributeError:模块“scrapy”没有属性“spider”

2024-05-14 11:09:15 发布

您现在位置:Python中文网/ 问答频道 /正文

这是我的代码,不确定我在这里做错了什么。谢谢你的帮助

from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd

# Target listing page: UFC athletes filtered via the query string
# (filters[0]=status:23 — presumably an "active athletes" filter; verify
# against the site).
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
# NOTE(review): passing the chromedriver path positionally is deprecated in
# Selenium 4 (use Service(...) instead) — confirm the installed version.
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)

class WebSpider(scrapy.Spider):
    """Scrapy spider that pages through the UFC athletes listing with Selenium.

    Fixes vs. the original:
    - ``scrapy.spider`` -> ``scrapy.Spider``: the base class is exposed as
      ``scrapy.Spider`` (capital S); the lowercase attribute does not exist,
      which is exactly the reported AttributeError.
    - ``self.drive`` -> ``self.driver`` (typo; the attribute set in
      ``__init__`` is ``driver``).
    - The "next page" lookup is moved inside the ``try`` so a missing
      pagination link ends the loop instead of raising out of ``parse``.
    - ``next`` no longer shadows the builtin; the bare ``except:`` is
      narrowed to ``Exception``.
    """

    name = "Web_Spider"
    allowed_domains = ['https://www.ufc.com/athletes']
    start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']

    def __init__(self, *args, **kwargs):
        # Keep Scrapy's own initialization intact, then attach the shared
        # Selenium driver created at module level.
        super().__init__(*args, **kwargs)
        self.driver = driver

    def parse(self, response):
        """Load the page in Selenium and click "next" until it disappears."""
        self.driver.get(response.url)

        while True:
            try:
                # find_element_by_xpath raises NoSuchElementException (and
                # click() may raise WebDriverException) once the pagination
                # link is gone — treat any failure as "no more pages".
                next_link = self.driver.find_element_by_xpath(
                    '//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
                next_link.click()
            except Exception:
                break

        self.driver.close()

我一直收到错误'AttributeError:模块'scrapy'没有属性'spider'。不确定在这里要做什么,Scrapy安装正确并且是最新的


Tags: from, https, import, self, div, com, url, www
3条回答

根据您要做的事情,我不会在这里使用 Selenium,因为您可以直接通过 ajax 获取数据。Selenium 也能实现,但有点大材小用,而且效率较低。

试试这个:

import requests
from bs4 import BeautifulSoup
import re


# The athlete listing is populated through a Drupal ajax endpoint, so we can
# POST to it directly instead of driving a browser.
url = 'https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'}

page = 1
while True:
    payload = {
        'view_name': 'all_athletes',
        'view_display_id': 'page',
        'view_path': '/athletes/all',
        'pager_element': '0',
        'gender': 'All',
        'page': '%s' % page,
    }

    jsonData = requests.post(url, headers=headers, data=payload).json()
    print('Page: %s' % page)
    page += 1

    # The rendered card markup rides in the last command of the ajax response.
    html = jsonData[-1]['data']
    soup = BeautifulSoup(html, 'html.parser')

    player_cards = soup.find_all(
        'div', {'class': re.compile('.*view-mode-all-athletes-result.*')})
    if not player_cards:
        # An empty page means we have paged past the last athlete.
        break

    for player_card in player_cards:
        name = player_card.find(
            'span', {'class': re.compile('.*athlete__name.*')}).text.strip()
        # Some cards omit weight class / record; .find() then returns None
        # and .text raises AttributeError — catch exactly that, not a bare
        # except that would also swallow KeyboardInterrupt etc.
        try:
            weight_class = player_card.find(
                'div', {'class': re.compile('.*weight-class.*')}).text.strip()
        except AttributeError:
            weight_class = 'N/A'
        try:
            record = player_card.find(
                'span', {'class': re.compile('.*athlete__record.*')}).text.strip()
        except AttributeError:
            record = 'N/A'
        print('\t%s - %s\t%s' % (name, weight_class, record))
Try now: 

from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd

# Target listing page: UFC athletes filtered via the query string
# (filters[0]=status:23 — presumably an "active athletes" filter; verify
# against the site).
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
# NOTE(review): passing the chromedriver path positionally is deprecated in
# Selenium 4 (use Service(...) instead) — confirm the installed version.
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)

class WebSpider(scrapy.Spider):
    """Scrapy spider that pages through the UFC athletes listing with Selenium.

    Uses the correct base class ``scrapy.Spider`` (capital S). Additional
    fixes vs. the posted version:
    - ``self.drive`` -> ``self.driver`` (typo; the attribute set in
      ``__init__`` is ``driver``).
    - The "next page" lookup is moved inside the ``try`` so a missing
      pagination link ends the loop instead of raising out of ``parse``.
    - ``next`` no longer shadows the builtin; the bare ``except:`` is
      narrowed to ``Exception``.
    """

    name = "Web_Spider"
    allowed_domains = ['https://www.ufc.com/athletes']
    start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']

    def __init__(self, *args, **kwargs):
        # Keep Scrapy's own initialization intact, then attach the shared
        # Selenium driver created at module level.
        super().__init__(*args, **kwargs)
        self.driver = driver

    def parse(self, response):
        """Load the page in Selenium and click "next" until it disappears."""
        self.driver.get(response.url)

        while True:
            try:
                # find_element_by_xpath raises NoSuchElementException (and
                # click() may raise WebDriverException) once the pagination
                # link is gone — treat any failure as "no more pages".
                next_link = self.driver.find_element_by_xpath(
                    '//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
                next_link.click()
            except Exception:
                break

        self.driver.close()

是 scrapy.Spider,注意大写的 “S”。

相关问题 更多 >

    热门问题