脚本解析索尼网页相机信息仅返回部分相机的信息

-1 投票
1 回答
35 浏览
提问于 2025-04-14 16:16
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from filtering import SonyPreview
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# Initialize WebDriver
driver = webdriver.Chrome()


def fetch_urls():
    url = "https://electronics.sony.com/imaging/interchangeable-lens-cameras/c/all-interchangeable-lens-cameras?currentPage=2"
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "custom-product-grid-item__content")))
    camera_elements = driver.find_elements(By.CLASS_NAME,
                                           "custom-product-grid-item__content a.custom-product-grid-item__info")
    urls = [element.get_attribute('href') for element in camera_elements]
    return urls


def specs_see_more(driver_arg):
    wait = WebDriverWait(driver_arg, 10)
    # Click on the "Specifications" heading to reveal the "See More" button
    wait.until(EC.element_to_be_clickable((By.ID, "PDPSpecificationsLink"))).click()

    # Wait for the "See More" button to become clickable after the click action above
    wait.until(EC.element_to_be_clickable((By.XPATH, "(//button[contains(text(),'See More')])[2]"))).click()

    time.sleep(5)
    # Return the BeautifulSoup object of the page for parsing
    return BeautifulSoup(driver_arg.page_source, 'html.parser')


def parse_each_page(urls):
    for url in urls:
        driver.get(url)
        soup = specs_see_more(driver)
        # Returns Name
        name = soup.find('p').text if soup.find('p') else 'Name not found'
        # Returns Price
        price_div = soup.find('div', class_='custom-product-summary__price')
        price = price_div.find('span').text if price_div and price_div.find('span') else 'Price not found'
        # Returns Specs for each camera
        full_specs = soup.find_all('div', class_="full-specifications__specifications-single-card")

        temp = {}
        for full_spec in full_specs:
            keys = full_spec.find_all('h4', class_='full-specifications__specifications-single-card__sub-list__name')
            values = full_spec.find_all('p', class_='full-specifications__specifications-single-card__sub-list__value')
            spec_entries = zip(keys, values)
            for key, value in spec_entries:
                temp[key.text.strip()] = value.text.strip()

        # Creates Pydantic class of sony camera and saves it as json file
        sony_obj = {'name': name, 'price': price, 'specs': temp}
        sony_instance = SonyPreview(**sony_obj)
        sony_instance.save_json()


page_url = fetch_urls()
parse_each_page(page_url)
driver.quit()

这是我的脚本。如果你有任何优化的建议,我会很乐意接受。

我主要的问题是,它能返回一些摄像头的信息,但有些摄像头却没有返回规格;而且有时候它会出错:

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

我该怎么解决这个问题呢?

请给我一些建议,帮我优化这个代码,并解决我遇到的所有问题。

1 个回答

0
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from filtering import SonyPreview
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# Initialize WebDriver
driver = webdriver.Chrome()
driver.maximize_window()


def fetch_urls():
    url = "https://electronics.sony.com/imaging/interchangeable-lens-cameras/c/all-interchangeable-lens-cameras?currentPage=2"
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "custom-product-grid-item__content")))
    camera_elements = driver.find_elements(By.CLASS_NAME,
                                           "custom-product-grid-item__content a.custom-product-grid-item__info")
    urls = [element.get_attribute('href') for element in camera_elements]
    return urls


def click_specifications(driver_arg, ID):
    WebDriverWait(driver_arg, 10).until(EC.element_to_be_clickable((By.ID, ID))).click()


def click_view_more(driver_arg, Xpath):
    WebDriverWait(driver_arg, 10).until(EC.element_to_be_clickable((By.XPATH, Xpath))).click()


def specs_see_more(driver_arg):
    # Click on the "Specifications" heading to reveal the "See More" button
    click_specifications(driver_arg, "PDPSpecificationsLink")
    # Wait for the "See More" button to become clickable after the click action above
    click_view_more(driver_arg, "(//button[contains(text(),'See More')])[2]")

    # Return the BeautifulSoup object of the page for parsing
    return BeautifulSoup(driver_arg.page_source, 'html.parser')


def parse_each_page(urls):
    for url in urls:
        driver.get(url)
        time.sleep(3)
        soup = specs_see_more(driver)
        time.sleep(3)
        # Returns Name
        name = soup.find('p').text if soup.find('p') else 'Name not found'
        # Returns Price
        price_div = soup.find('div', class_='custom-product-summary__price')
        price = price_div.find('span').text if price_div and price_div.find('span') else 'Price not found'
        # Returns Specs for each camera
        full_specs = soup.find_all('div', class_="full-specifications__specifications-single-card")

        temp = {}
        for full_spec in full_specs:
            keys = full_spec.find_all('h4', class_='full-specifications__specifications-single-card__sub-list__name')
            values = full_spec.find_all('p', class_='full-specifications__specifications-single-card__sub-list__value')
            spec_entries = zip(keys, values)
            for key, value in spec_entries:
                temp[key.text.strip()] = value.text.strip()

        # Creates Pydantic class of sony camera and saves it as json file
        sony_obj = {'name': name, 'price': price, 'specs': temp}
        sony_instance = SonyPreview(**sony_obj)
        sony_instance.save_json()


page_url = fetch_urls()
parse_each_page(page_url)
driver.quit()

我把点击预览和查看更多的操作放在了方法里,并添加了时间延迟的命令,这样可以让网页加载得更清楚。

撰写回答