我正试图从这个网站上抓取每个表的详细信息。 https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx
截图
我需要点击每条记录的详情按钮,跳转到新窗口抓取数据,并对当前页面中的其余记录重复此操作,然后翻到下一页继续。这是我的 Selenium 代码:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
# Path to the ChromeDriver binary (relative to the working directory).
PATH = 'chromedriver.exe'

options = Options()
# Exclude Chrome's 'enable-logging' switch and request an English UI.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--lang=en")

# NOTE(review): `executable_path` is deprecated in Selenium 4 and rejected by
# newer 4.x releases, which expect a `Service` object instead. It is kept here
# so the script still runs on the Selenium version it was written for.
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.maximize_window()
driver.get('https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx')

# Use find_element(By.XPATH, ...) rather than find_element_by_xpath: the
# find_element_by_* helpers were removed in Selenium 4.3, while this form
# works on both Selenium 3 and 4.
# NOTE(review): presumably this input is the search/submit button that
# populates the results grid -- confirm against the page.
driver.find_element(By.XPATH, '//*[@id="Div1"]/input').click()
def wait(locator, id):
    """Block until at least one element matching the locator is in the DOM.

    Args:
        locator: A ``By`` strategy; callers in this file pass
            ``By.CLASS_NAME`` or ``By.XPATH``.
        id: The selector value for that strategy. The name shadows the
            builtin ``id`` but is kept so existing callers are unaffected.

    Returns:
        The list of matching elements.

    Raises:
        selenium.common.exceptions.TimeoutException: If nothing matches
            within 50 seconds.
    """
    elements = WebDriverWait(driver, 50).until(
        EC.presence_of_all_elements_located((locator, id))
    )
    return elements
# Accumulates one dict per scraped record; each_page() appends to it.
DATA = []
# XPath locators for the fields of a record's detail view. each_page() reads
# the `value` attribute of the element each one matches.
name = '//*[@id="ctl00_ContentPlaceHolder1_TxtName_I"]'
postal = '//*[@id="ctl00_ContentPlaceHolder1_TxtPostalCode_I"]'
fax = '//*[@id="ctl00_ContentPlaceHolder1_TxtFax_I"]'
province = '//*[@id="ctl00_ContentPlaceHolder1_TxtDistrict_I"]'
email = '//*[@id="ctl00_ContentPlaceHolder1_TxtEmail_I"]'
address = '//*[@id="ctl00_ContentPlaceHolder1_TxtAddress_I"]'
phone = '//*[@id="ctl00_ContentPlaceHolder1_TxtPhone_I"]'
courtroom = '//*[@id="ctl00_ContentPlaceHolder1_TxtCourtBox_I"]'
webpage = '//*[@id="ctl00_ContentPlaceHolder1_TxtUrl_I"]'
# Output keys for every field except the name. each_page() pairs these by
# position with [postal, fax, province, email, address, phone, courtroom,
# webpage], so the order here must match that list.
details = ['Postal Code', 'Fax', 'Calendar Province', 'Email', 'Address', 'Phone', 'Courtroom', 'Webpage']
def gotopage(page):
    """Click the grid's "next page" button ``page - 1`` times.

    Does nothing when ``page`` is 1 or less.

    Args:
        page: 1-based number of the page to reach.
            NOTE(review): this only lands on ``page`` if the grid is on
            page 1 when the function is called -- confirm at call sites.
    """
    for _ in range(page - 1):
        # The button is looked up again on every iteration rather than
        # reused. find_element(By.CLASS_NAME, ...) replaces
        # find_element_by_class_name, which was removed in Selenium 4.3.
        next_page = driver.find_element(By.CLASS_NAME, 'dxWeb_pNext_Material')
        action = ActionChains(driver)
        action.click(next_page)
        action.perform()
        # NOTE(review): the pasted source lost its indentation; the pause is
        # assumed to belong inside the loop (one wait per page turn).
        time.sleep(4)
def each_page(page, new):
    """Scrape the detail view of up to 80 records from one results page.

    For each record index from 0 to 79 the function optionally pages forward
    to ``page``, operates the pager's page-size selector, opens that record's
    detail view, reads its fields into a dict, appends the dict to the
    module-level ``DATA`` list, and navigates back in browser history.

    Args:
        page: 1-based number of the results page to scrape.
        new: When true and ``page > 1``, ``gotopage(page)`` is called before
            every record.

    Raises:
        selenium.common.exceptions.TimeoutException: If the detail buttons or
            the name field do not appear within the timeout used by ``wait``.
        selenium.common.exceptions.NoSuchElementException: If the page-size
            selector or one of the detail fields is missing.

    NOTE(review): the pasted source lost its indentation. The nesting below
    (only ``gotopage`` guarded by the ``if``; everything through
    ``driver.back()`` inside the ``while``) is the most plausible reading and
    should be confirmed against the original script.
    """
    # Locators for the non-name fields, in the same order as `details`.
    field_xpaths = [postal, fax, province, email, address, phone, courtroom, webpage]

    curr = 0
    while curr < 80:
        if page > 1 and new:
            gotopage(page)

        # Open the pager's page-size selector and pick an entry with the
        # keyboard (ARROW_UP, then RETURN).
        action = ActionChains(driver)
        action.move_to_element(
            driver.find_element(
                By.XPATH,
                '//*[@id="ctl00_ContentPlaceHolder1_LawyersGrid_DXPagerBottom_PSI"]',
            )
        ).click()
        action.perform()
        # NOTE(review): the same ActionChains object is reused without
        # reset_actions(), so this second perform() may replay the earlier
        # move/click before sending the keys. Preserved as in the original.
        action.send_keys(Keys.ARROW_UP, Keys.RETURN)
        action.perform()
        time.sleep(17)

        data = {}
        action = ActionChains(driver)
        detail_list = wait(By.CLASS_NAME, 'dxb-hbc')
        try:
            action.click(detail_list[curr])
            action.perform()
        except IndexError:
            # Fewer than curr + 1 detail buttons were found on this page.
            # NOTE(review): after this handler the code still falls through
            # and tries to read the detail fields, as the original did; the
            # intended recovery is unclear from the source.
            print(curr)
            driver.back()
            gotopage(page)

        data['Name'] = wait(By.XPATH, name)[0].get_attribute('value')
        # Pair each output key with its locator instead of indexing into
        # `details`. find_element(By.XPATH, ...) replaces
        # find_element_by_xpath, which was removed in Selenium 4.3.
        for label, xpath in zip(details, field_xpaths):
            data[label] = driver.find_element(By.XPATH, xpath).get_attribute('value')

        DATA.append(data)
        curr += 1
        driver.back()
print('============SCRAPING===============')

# Scrape results pages 1 through 50, one call to each_page() per page.
page = 1
new = True
while page <= 50:
    try:
        each_page(page, new)
        page += 1
    except Exception as err:
        # Best-effort: report the error and the page it happened on.
        # NOTE(review): `page` is not advanced on failure, so the same page
        # is retried from its first record; a page that fails every time
        # will loop forever. Consider capping retries.
        print(err)
        print(page)
问题在于,这段代码慢得难以置信,因为每次调用
driver.back()
浏览器都会回到第1页,而我需要的是回到刚才所在的那一页。
有没有办法用 BeautifulSoup 之类的工具来实现这个目的?
目前没有回答
相关问题 更多 >
编程相关推荐