How to paginate and scrape data from an aspx page without Selenium

Posted 2024-04-26 21:19:56


I'm trying to scrape the details for every record in the table on this site: https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx

[screenshot]

I need to click each details box, which opens a new window, do the same for every other record on the page, and then move on to the next page. Here is my Selenium code:

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

PATH = 'chromedriver.exe'

options = Options()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--lang=en")

driver = webdriver.Chrome(service=Service(PATH), options=options)
driver.maximize_window()


driver.get('https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx')
driver.find_element(By.XPATH, '//*[@id="Div1"]/input').click()  # click through the landing prompt

def wait(locator, id):
    # Wait up to 50s until all matching elements are present, then return them.
    return WebDriverWait(driver, 50).until(
        EC.presence_of_all_elements_located((locator, id))
    )


DATA = []

name = '//*[@id="ctl00_ContentPlaceHolder1_TxtName_I"]'
postal = '//*[@id="ctl00_ContentPlaceHolder1_TxtPostalCode_I"]'
fax = '//*[@id="ctl00_ContentPlaceHolder1_TxtFax_I"]'
province = '//*[@id="ctl00_ContentPlaceHolder1_TxtDistrict_I"]'
email = '//*[@id="ctl00_ContentPlaceHolder1_TxtEmail_I"]'
address = '//*[@id="ctl00_ContentPlaceHolder1_TxtAddress_I"]'
phone = '//*[@id="ctl00_ContentPlaceHolder1_TxtPhone_I"]'
courtroom = '//*[@id="ctl00_ContentPlaceHolder1_TxtCourtBox_I"]'
webpage = '//*[@id="ctl00_ContentPlaceHolder1_TxtUrl_I"]'

details = ['Postal Code', 'Fax', 'Calendar Province', 'Email', 'Address', 'Phone', 'Courtroom', 'Webpage']

def gotopage(page):
    # The DevExpress pager only exposes a "next" arrow, so reaching page N
    # after a reset means clicking it N-1 times.
    for p in range(page - 1):
        next_page = driver.find_element(By.CLASS_NAME, 'dxWeb_pNext_Material')
        ActionChains(driver).click(next_page).perform()
        time.sleep(4)  # let the grid callback render the next page

def each_page(page, new):
    global DATA
    curr = 0

    while curr < 80:
        # driver.back() resets the grid to page 1, so re-navigate first.
        if page > 1 and new:
            gotopage(page)

        # Open the page-size selector in the bottom pager, then pick the
        # largest entry (80 rows) with arrow-up + enter. A fresh ActionChains
        # is built each time so earlier actions are not replayed.
        psi = driver.find_element(
            By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_LawyersGrid_DXPagerBottom_PSI"]')
        ActionChains(driver).move_to_element(psi).click().perform()
        ActionChains(driver).send_keys(Keys.ARROW_UP, Keys.RETURN).perform()

        time.sleep(17)  # generous wait for the 80-row grid to finish rendering
        data = {}

        # Click the curr-th "details" button in the grid.
        detail_list = wait(By.CLASS_NAME, 'dxb-hbc')
        try:
            ActionChains(driver).click(detail_list[curr]).perform()
        except IndexError:
            print(curr)
            driver.back()
            gotopage(page)
            continue  # the click failed, so retry this record

        data['Name'] = wait(By.XPATH, name)[0].get_attribute('value')

        for i, d in enumerate([postal, fax, province, email, address,
                               phone, courtroom, webpage]):
            data[details[i]] = driver.find_element(By.XPATH, d).get_attribute('value')

        DATA.append(data)
        curr += 1
        driver.back()
        

print('============SCRAPING===============')

page = 1
new = True
while page <= 50:
    try:
        each_page(page, new)
        page += 1
    except Exception as err:
        print(err)
        print(page)
      

The problem here is that this is incredibly slow, because every time you call driver.back() the grid returns to page 1, and I then have to page all the way back to where I was before scraping the next record.
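One partial workaround I'm considering, assuming the DevExpress pager renders its numeric page links with the usual dxp-num class (worth confirming against the pager's HTML), would be to jump straight to the target page instead of clicking "next" page-1 times after every reset:

# Hedged sketch: jump directly to a page via the pager's numeric links.
# The 'dxp-num' class name is an assumption about DevExpress pager markup.
def jump_to_page(page):
    for link in driver.find_elements(By.CLASS_NAME, 'dxp-num'):
        if link.text.strip() == str(page):
            link.click()
            time.sleep(4)  # let the grid callback finish
            return True
    return False  # that page number is not currently visible in the pager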

Is there any way I can achieve this with something like BeautifulSoup instead?
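In principle the grid is an ASP.NET WebForms/DevExpress control, so paging should just be a POST back to the same URL carrying the page's hidden state fields. Below is a minimal sketch with requests and BeautifulSoup. The hidden-field handling is standard ASP.NET, but the __EVENTTARGET/__EVENTARGUMENT values are placeholders (a DevExpress grid may use __CALLBACKID/__CALLBACKPARAM instead), and the row-class selector is a guess; the real values need to be copied from the request the browser sends when clicking the pager (DevTools > Network).

import requests
from bs4 import BeautifulSoup

URL = 'https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx'
session = requests.Session()

def hidden_fields(html):
    # Collect every hidden input, including __VIEWSTATE and
    # __EVENTVALIDATION, which ASP.NET expects on every postback.
    soup = BeautifulSoup(html, 'html.parser')
    return {inp['name']: inp.get('value', '')
            for inp in soup.select('input[type=hidden]') if inp.get('name')}

fields = hidden_fields(session.get(URL).text)

# Placeholder postback values -- replace with the ones observed in DevTools
# when clicking "next page" in a real browser session.
fields['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$LawyersGrid'
fields['__EVENTARGUMENT'] = 'PBN'  # hypothetical "next page" argument

page2 = session.post(URL, data=fields)
# 'dxgvDataRow_Material' is a guess at the DevExpress row class; verify it.
rows = BeautifulSoup(page2.text, 'html.parser').select('tr.dxgvDataRow_Material')
print(len(rows))

The same capture-and-replay idea would cover the detail view too: record the request the details click sends and replay it per row, which removes both the long sleeps and the driver.back() resets. If the grid answers with a DevExpress callback payload instead of full HTML, the parsing step changes but the approach is the same.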