使用Selenium抓取Power BI仪表板
我在用Selenium抓取一个Power BI仪表板时遇到了一些麻烦。看起来我抓取的网址是对的,代码结构也不错,但代码在解析第一列(工作名称)之后,无法成功解析所有其他列。
我不需要点击任何东西,只需要向下滚动页面来提取所有数据。
State(州)列数据的长度只有150,而 Job Name(工作名称)列的长度是362。接下来的列,比如“Licensed, Registered or Certified by State”(由州许可、注册或认证),返回的长度只有62。出现上述问题之后,我就没有再给后面的列添加滚动代码,因为在此之前本来也没有抓取到多少数据。所有列我都沿用了同样的 div 选择器写法,因为页面的HTML结构是一样的。
如果有人能帮我理解我哪里出错了,我将非常感激。再次强调,我只是想抓取上面仪表板中的表格。
import time
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
# Browser session shared by every column lookup below.
option = webdriver.ChromeOptions()
option.add_argument("--start-maximized")
driver = webdriver.Chrome(options=option)
wait = WebDriverWait(driver, 10)

# Load the page
driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection")


def _scroll_page_to_bottom(pause=1, max_rounds=30):
    """Scroll the browser window down until it reports being at the bottom.

    Args:
        pause: Seconds to sleep after each scroll so the page can load more
            content; adjust according to the page load speed.
        max_rounds: Upper bound on scroll attempts, so the loop cannot spin
            forever if the bottom-of-page check never becomes true.

    NOTE(review): this scrolls the window itself, not the table inside the
    report; whether that makes the report load more rows should be confirmed.
    """
    for _ in range(max_rounds):
        # Scroll down using JavaScript
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        # Check if we have reached the bottom of the page
        viewport_bottom = driver.execute_script("return window.innerHeight + window.scrollY")
        page_height = driver.execute_script("return document.body.scrollHeight")
        if viewport_bottom >= page_height:
            break


def _wait_for_column(label):
    """Wait until the <div> whose aria-label equals *label* is visible and return it.

    Raises whatever ``wait.until`` raises when the element does not become
    visible within the wait's timeout.
    """
    locator = (By.CSS_SELECTOR, 'div[aria-label="{}"]'.format(label))
    return wait.until(EC.visibility_of_element_located(locator))


# Each column below follows the same steps: locate the column element,
# scroll the page to the bottom, then read the element's text.

## job name
job_name_data_element = _wait_for_column("Job Name")
_scroll_page_to_bottom()
job_name_data = job_name_data_element.text

## state
state_column_element = _wait_for_column("State")
_scroll_page_to_bottom()
state_data = state_column_element.text

## license
licensed_column_element = _wait_for_column("Licensed")
_scroll_page_to_bottom()
licensed_data = licensed_column_element.text

## education
education_column_element = _wait_for_column("Education Requirement")
_scroll_page_to_bottom()
education_data = education_column_element.text
def _visible_column(label):
    """Wait until the <div> whose aria-label equals *label* is visible and return it.

    Uses the module-level ``wait``; raises whatever ``wait.until`` raises when
    the element does not become visible within the wait's timeout.
    """
    locator = (By.CSS_SELECTOR, 'div[aria-label="{}"]'.format(label))
    return wait.until(EC.visibility_of_element_located(locator))


# The remaining columns are read as currently rendered (no scrolling).

## training
# NOTE(review): this label uses square brackets "[In Hours]" while the other
# unit-bearing labels use parentheses "(In ...)" -- confirm against the page.
training_column_element = _visible_column("Amount of Training Required [In Hours]")
training_data = training_column_element.text

## experience
experience_column_element = _visible_column("Amount of Experience Required")
experience_data = experience_column_element.text

## pro exam
exam_column_element = _visible_column("Professional Exam")
exam_data = exam_column_element.text

## renewal time
renewal_time_column_element = _visible_column("Required Time of License Renewal (In Years)")
renewal_time_data = renewal_time_column_element.text

## continuing education
continious_education_column_element = _visible_column("Continuing Education Requirement")
continious_education_column_element_data = continious_education_column_element.text

## additional exams
additional_exams_column_element = _visible_column("Additional Required Exams")
additional_exams_column_element_data = additional_exams_column_element.text

## cost of initial licensure
cost_of_licensure_column_element = _visible_column("Cost of Initial Licensure (In Dollars)")
cost_of_licensure_column_element_data = cost_of_licensure_column_element.text

## license renewal
license_renewal_column_element = _visible_column("Cost of License Renewal (In Dollars)")
license_renewal_column_element_data = license_renewal_column_element.text

## reciprocity
reciprocity_column_element = _visible_column("Reciprocity or Endorsement")
reciprocity_column_element_data = reciprocity_column_element.text

## character
character_column_element = _visible_column("Good Moral Character Requirement")
character_column_element_data = character_column_element.text

## blanket ban
ban_column_element = _visible_column("Blanket Ban for Ex-Offenders")
ban_column_element_data = ban_column_element.text

## rehab (looked up once; the original repeated this lookup verbatim)
rehab_column_element = _visible_column("Rehabilitation Requirement")
rehab_column_element_data = rehab_column_element.text

## relationship
relationship_column_element = _visible_column("Relationship between Offense and Occupation")
relationship_column_element_data = relationship_column_element.text

## Limitations
limitations_column_element = _visible_column("Limitations on Scope of Inquiry")
limitations_column_element_data = limitations_column_element.text

## age
age_column_element = _visible_column("Minimum Age (In Years)")
age_column_element_data = age_column_element.text
1 个回答
0
这里有几个问题...
- 你是在滚动整个页面,而不是只滚动包含你想要数据的表格。
- 当你水平或垂直滚动表格时,屏幕外的元素(行或列)实际上会从页面的结构中消失。
这会让抓取数据变得非常麻烦。
不过,我写了一些基础代码来帮你入门。这个代码甚至没有尝试去滚动... 只是提取了当前可见的内容(还有一些其他的)。它只看到了每行的前14个单元格和前20行。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
url = 'https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection'

# Open the report in a maximized Chrome window.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)

wait = WebDriverWait(driver, 10)
# The table is the role='document' container that has a descendant <div>
# whose text is exactly 'Job Name'.
table = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@role='document'][.//div[text()='Job Name']]")))

headers = table.find_elements(By.CSS_SELECTOR, "div[role='columnheader']")
# Clean up first header. Slicing (unlike pop(0)) does not raise when no
# header elements were found.
headers = headers[1:]
# print(len(headers))

# Header captions with surrounding whitespace stripped.
h = [header.text.strip() for header in headers]
print(h)
# Print the text of every grid cell for each row found under `table`.
rows = table.find_elements(By.CSS_SELECTOR, "div[role='row']")
# Clean up empty row. Slicing (unlike pop(0)) does not raise when no rows
# were found.
rows = rows[1:]

for row in rows:
    cells = row.find_elements(By.CSS_SELECTOR, "div[role='gridcell']")
    if not cells:
        # A row without grid cells has nothing to print; skip it.
        continue
    cells = cells[1:]  # clean up empty cell
    # print(len(cells))
    c = [cell.text for cell in cells]
    print(c)
输出结果
['Job Name', 'State', 'Licensed, Registered or Certified by State', 'Education Requirement', 'Amount of Training Required (In Hours)', 'Amount of Experience Required', 'Professional Exam', 'Required Time of License Renewal (In Years)', 'Continuing Education Requirement', 'Additional Required Exams', 'Cost of Initial Licensure (In Dollars)', 'Cost of License Renewal (In Dollars)', 'Reciprocity or Endorsement', 'Good Moral Character Requirement']
['Athletic Trainer', 'Alabama', 'Licensed', 'A bachelor’s degree is required (from an accredited academic institution or similarly recognized institution)', '0', '0', 'Yes, individuals must take an exam to attain licensure', '1', '26 hrs x 1 yr', '0', '505', '75', 'State does have statutory language allowing reciprocity or endorsement agreements', 'State does not have a “good moral character" clause']
... 以此类推