使用 Python 和 Selenium 查找网页中的表格

"""Scrape the Utah COVID-19 case-count table and save it as a timestamped CSV.

The script opens the dashboard in Chrome, waits for the container element
to become visible, parses the table inside it with BeautifulSoup/pandas and
writes the resulting DataFrame to ``<st>_covid_cnts_<timestamp>.csv``.
"""
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

st = 'ut'
url = 'https://coronavirus.utah.gov/case-counts/'
timeout = 20
container_id = "total-number-of-lab-confirmed-covid-19-cases-living-in-utah"
output_dir = Path(r'D:\Work\Python\projects\Covid_WebScraping\output')

# Spawn the webpage using Selenium
driver = webdriver.Chrome(r'D:\Work\Python\utilities\chromedriver\chromedriver.exe')
try:
    driver.minimize_window()
    driver.get(url)

    # Let page load . . . it takes a while.
    # The expected condition takes the locator tuple as its argument, and
    # until() returns the located element once it is visible.
    # NOTE(review): if the dashboard is rendered inside an <iframe>, the
    # driver must first switch into it (driver.switch_to.frame(...)) or this
    # wait will time out -- confirm against the live page.
    container = WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.ID, container_id))
    )

    # Now, scrape table: BeautifulSoup needs markup, not a WebElement.
    html = container.get_attribute('innerHTML')
finally:
    # quit() ends the whole session (browser and chromedriver), even when
    # the wait above raises.
    driver.quit()

soup = BeautifulSoup(html, 'lxml')
# The id attribute has no leading '#'; that prefix is CSS-selector syntax.
table = soup.find('table', id='DataTables_Table_0')
if table is None:
    raise RuntimeError("table 'DataTables_Table_0' not found in the scraped HTML")

# Wrap the markup in StringIO: pandas >= 2.1 deprecates literal HTML strings.
counts = pd.read_html(StringIO(str(table)))[0]

# Plain variable and a direct to_csv() call replace the exec()-built code.
stamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
counts.to_csv(output_dir / f'{st}_covid_cnts_{stamp}.csv', index=False)

1条回答

网友

1楼 · 发布于 2024-06-09 21:37:17

我找到了另一种方法来获取我需要的信息

感谢 Julian Stanley 让我了解到 Katalon Recorder 这款工具。借助它，我看清了页面中的 iframe 是哪一个，以及表格位于何处。

按照原来的做法使用 CSS 或 XPath 查找元素时，会因线程锁而引发 Pickle 错误。我不知道该如何处理这个问题，结果整个项目一度因此停滞不前。

不过，我可以通过元素的属性（innerHTML）获取表格的文本/HTML，之后再像往常一样用 BS4 解析。

"""Scrape the Utah COVID-19 case-count table from the dashboard iframe.

The table lives inside the first iframe of the page, so the script switches
into that frame, grabs the inner HTML of the DataTables wrapper, parses it
with BeautifulSoup/pandas and writes the DataFrame to a timestamped CSV.
"""
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

st = 'ut'
url = 'https://coronavirus.utah.gov/case-counts/'
timeout = 20
output_dir = Path(r'D:\Work\Python\projects\Covid_WebScraping\output')

# Spawn the webpage using Selenium
# NOTE(review): recent Selenium 4 releases no longer accept the driver path
# as a positional argument and expect a Service object -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(r'D:\Work\Python\utilities\chromedriver\chromedriver.exe')
try:
    driver.get(url)

    # Let page load . . . it takes a while
    wait = WebDriverWait(driver, timeout)

    # Switch to the first iframe (index 0) as soon as it is available.
    # The frame could presumably also be addressed by its id,
    # "coronavirus-dashboard" -- verify on the live page.
    wait.until(EC.frame_to_be_available_and_switch_to_it(0))

    # Now, scrape table: wait for the DataTables wrapper, then read its markup.
    wrapper = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#DataTables_Table_0_wrapper'))
    )
    html = wrapper.get_attribute('innerHTML')
finally:
    # Close the chrome web driver; quit() also stops the chromedriver
    # process and runs even if a wait above times out.
    driver.quit()

soup = BeautifulSoup(html, 'lxml')
table = soup.find('table', id='DataTables_Table_0')
if table is None:
    raise RuntimeError("table 'DataTables_Table_0' not found in the iframe HTML")

# Wrap the markup in StringIO: pandas >= 2.1 deprecates literal HTML strings.
counts = pd.read_html(StringIO(str(table)))[0]

# Plain variable and a direct to_csv() call replace the exec()-built code.
stamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
counts.to_csv(output_dir / f'{st}_covid_cnts_{stamp}.csv', index=False)

相关问题更多 >

编程相关推荐

热门问题

热门文章