Script scraping Sony's camera web pages only returns information for some of the cameras
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from filtering import SonyPreview
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# Initialize WebDriver
driver = webdriver.Chrome()


def fetch_urls():
    url = "https://electronics.sony.com/imaging/interchangeable-lens-cameras/c/all-interchangeable-lens-cameras?currentPage=2"
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "custom-product-grid-item__content")))
    camera_elements = driver.find_elements(By.CLASS_NAME,
                                           "custom-product-grid-item__content a.custom-product-grid-item__info")
    urls = [element.get_attribute('href') for element in camera_elements]
    return urls


def specs_see_more(driver_arg):
    wait = WebDriverWait(driver_arg, 10)
    # Click on the "Specifications" heading to reveal the "See More" button
    wait.until(EC.element_to_be_clickable((By.ID, "PDPSpecificationsLink"))).click()
    # Wait for the "See More" button to become clickable after the click action above
    wait.until(EC.element_to_be_clickable((By.XPATH, "(//button[contains(text(),'See More')])[2]"))).click()
    time.sleep(5)
    # Return the BeautifulSoup object of the page for parsing
    return BeautifulSoup(driver_arg.page_source, 'html.parser')


def parse_each_page(urls):
    for url in urls:
        driver.get(url)
        soup = specs_see_more(driver)
        # Returns Name
        name = soup.find('p').text if soup.find('p') else 'Name not found'
        # Returns Price
        price_div = soup.find('div', class_='custom-product-summary__price')
        price = price_div.find('span').text if price_div and price_div.find('span') else 'Price not found'
        # Returns Specs for each camera
        full_specs = soup.find_all('div', class_="full-specifications__specifications-single-card")
        temp = {}
        for full_spec in full_specs:
            keys = full_spec.find_all('h4', class_='full-specifications__specifications-single-card__sub-list__name')
            values = full_spec.find_all('p', class_='full-specifications__specifications-single-card__sub-list__value')
            spec_entries = zip(keys, values)
            for key, value in spec_entries:
                temp[key.text.strip()] = value.text.strip()
        # Creates Pydantic class of sony camera and saves it as json file
        sony_obj = {'name': name, 'price': price, 'specs': temp}
        sony_instance = SonyPreview(**sony_obj)
        sony_instance.save_json()


page_url = fetch_urls()
parse_each_page(page_url)
driver.quit()
This is my script. I'm happy to take any suggestions for improving it.
My main problem is that it returns information for some cameras, but for others no specs come back at all; and sometimes it fails with:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found
How can I fix this?
Please give me some advice on improving this code and resolving all of the issues I'm running into.
1 Answer
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from filtering import SonyPreview
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# Initialize WebDriver
driver = webdriver.Chrome()
driver.maximize_window()


def fetch_urls():
    url = "https://electronics.sony.com/imaging/interchangeable-lens-cameras/c/all-interchangeable-lens-cameras?currentPage=2"
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "custom-product-grid-item__content")))
    camera_elements = driver.find_elements(By.CLASS_NAME,
                                           "custom-product-grid-item__content a.custom-product-grid-item__info")
    urls = [element.get_attribute('href') for element in camera_elements]
    return urls


def click_specifications(driver_arg, ID):
    WebDriverWait(driver_arg, 10).until(EC.element_to_be_clickable((By.ID, ID))).click()


def click_view_more(driver_arg, Xpath):
    WebDriverWait(driver_arg, 10).until(EC.element_to_be_clickable((By.XPATH, Xpath))).click()


def specs_see_more(driver_arg):
    # Click on the "Specifications" heading to reveal the "See More" button
    click_specifications(driver_arg, "PDPSpecificationsLink")
    # Wait for the "See More" button to become clickable after the click action above
    click_view_more(driver_arg, "(//button[contains(text(),'See More')])[2]")
    # Return the BeautifulSoup object of the page for parsing
    return BeautifulSoup(driver_arg.page_source, 'html.parser')


def parse_each_page(urls):
    for url in urls:
        driver.get(url)
        time.sleep(3)
        soup = specs_see_more(driver)
        time.sleep(3)
        # Returns Name
        name = soup.find('p').text if soup.find('p') else 'Name not found'
        # Returns Price
        price_div = soup.find('div', class_='custom-product-summary__price')
        price = price_div.find('span').text if price_div and price_div.find('span') else 'Price not found'
        # Returns Specs for each camera
        full_specs = soup.find_all('div', class_="full-specifications__specifications-single-card")
        temp = {}
        for full_spec in full_specs:
            keys = full_spec.find_all('h4', class_='full-specifications__specifications-single-card__sub-list__name')
            values = full_spec.find_all('p', class_='full-specifications__specifications-single-card__sub-list__value')
            spec_entries = zip(keys, values)
            for key, value in spec_entries:
                temp[key.text.strip()] = value.text.strip()
        # Creates Pydantic class of sony camera and saves it as json file
        sony_obj = {'name': name, 'price': price, 'specs': temp}
        sony_instance = SonyPreview(**sony_obj)
        sony_instance.save_json()


page_url = fetch_urls()
parse_each_page(page_url)
driver.quit()
I moved the Specifications click and the "View More" click into their own methods, and added time delays so the page has more time to load fully.
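For the StaleElementReferenceException itself, fixed sleeps only paper over the race: if the page re-renders the button between the wait and the click, the reference you hold is already dead. A more robust pattern is to re-locate the element and retry on failure, and to wait for the spec cards themselves before grabbing page_source. Here is a minimal sketch of that idea, reusing the locators from the script above (the helper names click_with_retry and wait_for_specs are mine, not part of the original script):

import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException


def click_with_retry(driver_arg, locator, attempts=3):
    # Re-locate the element on every attempt, so a node that went stale
    # after a re-render is found fresh instead of clicked via a dead reference.
    for attempt in range(attempts):
        try:
            WebDriverWait(driver_arg, 10).until(
                EC.element_to_be_clickable(locator)).click()
            return
        except StaleElementReferenceException:
            if attempt == attempts - 1:
                raise
            time.sleep(1)  # brief pause before re-locating


def wait_for_specs(driver_arg):
    # Wait until at least one spec card is in the DOM instead of sleeping
    # a fixed number of seconds before reading page_source.
    WebDriverWait(driver_arg, 10).until(EC.presence_of_all_elements_located(
        (By.CLASS_NAME, "full-specifications__specifications-single-card")))

Inside specs_see_more you would then call, for example:

click_with_retry(driver_arg, (By.ID, "PDPSpecificationsLink"))
click_with_retry(driver_arg, (By.XPATH, "(//button[contains(text(),'See More')])[2]"))
wait_for_specs(driver_arg)

Waiting for the cards should also help with the cameras that came back with no specs, since page_source is only captured once the specification markup actually exists.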