Scraping paginated pages with Selenium and Python

-2 votes
1 answer
28 views
Asked 2025-04-12 04:09

I'm practicing web scraping and wrote a script that visits my personal Coursera account (I didn't see anything in their "robots.txt" file forbidding it, and since this is my own information I think it's fine). The main caveat is that I still have to solve the CAPTCHA manually when one appears, but other than that, scraping the text on the first page works correctly. However, I have many pages of data to scrape, and I'm now running into trouble with the pagination. Can anyone offer advice? I suspect the problem is in my def get_pages(self): method. The print statement I added inside it shows it getting to page 2, but it seems to finish before it actually runs...

My code is:


import time
import secret
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

'''
Code text
'''

template = 'Certificate Name: {}\nCredential ID: {}\n'

class CourseraScraper:
    '''
    Class variables
    '''
    def __init__(self, url: str, username: str, password: str):
        # options = webdriver.FirefoxOptions()
        # options.add_argument('--headless')
        self.url = url
        self.username = username
        self.password = password
        self.browser = webdriver.Firefox()
        self.page_counter = 1
        self.selector_counter = 2

    def login(self):
        '''
        Using login info saved in secret.py, login using CSS_SELECTOR
        '''
        self.browser.get(self.url)
        print('Accessing browser URL...')
        username_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#email')
        username_css_selector.send_keys(self.username)
        print('Entering username...')
        
        pwd_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#password')
        pwd_css_selector.send_keys(self.password)
        print('Entering password...')
        
        login_button = self.browser.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
        login_button.click()
        print('Clicking login button...')
        print('Time delay for CAPTCHA...')
        time.sleep(60)


    def click_accomplishments(self):
        '''
        Once logged in, pause (there is a box element that sometimes obscures
        the drop-down, so the pause should allow the element to become visible),
        then click "Accomplishments" from the menu.
        '''
        print('Finding "Accomplishments" from drop-down...')
        select_dropdown = self.browser.find_element(By.CSS_SELECTOR, '.cds-Avatar-initial')
        select_dropdown.click()

        print('Clicking link...')
        accomplishments_link = self.browser.find_element(By.CSS_SELECTOR, 'li.dropdown-btn:nth-child(6) > a:nth-child(1)')
        accomplishments_link.click()
        
        time.sleep(5)


    def get_pages(self):
        '''
        This is the problem area - trying to select page, then scrape - but it does not proceed to the 2nd page.
        '''
        n = self.page_counter
        pages = self.browser.find_elements(By.XPATH, f'//*[@id="pagination_number_box_{n}"]')
        for page in pages:
            print(f'Starting on page {n}')
            self.scrape_page()
            n = self.counter_increment()
        print(f'Total pages scraped: {n}')
        print('Process complete!')


    def counter_increment(self):
        '''
        The page count is initialized to 1, then increment n to the next page(s) number
        '''
        self.page_counter += 1
        return self.page_counter


    def selector_increment(self):
        '''
        Box selector starts at n = 2, then increment for each one encountered
        '''
        self.selector_counter += 1
        return self.selector_counter


    def scrape_page(self):
        '''
        Scrape specific element using CSS_SELECTOR and return the text to the function write_text_to_file() to be written to .txt file.
        '''
        n = self.selector_counter
        
        while True:
            tag_css_selector = f'div.rc-AccomplishmentCard:nth-child({n}) > div:nth-child(1) > div:nth-child(2) > a:nth-child(1)'
            try:
                accomplishment = self.browser.find_element(By.CSS_SELECTOR, tag_css_selector)
                h3_text = accomplishment.find_element(By.TAG_NAME, 'h3').text
                to_field = accomplishment.get_attribute('to')
                to_field_text_split = to_field.rsplit('/', 1)[-1]

                formatted_text = template.format(h3_text, to_field_text_split)
                print(formatted_text)
                self.write_text_to_file(formatted_text)
                n = self.selector_increment()

            except NoSuchElementException:
                print('Reached end of elements.')
                break

            except Exception as e:
                print(f'An error occurred: {e}')
                break


    def write_text_to_file(self, write_text: str):
        '''
        Write values to file
        '''
        with open('div_elements.txt', 'a+') as f:
            f.write(write_text + '\n')


    def close(self):
        '''
        Close browser session.
        '''
        self.browser.close()
        print('All done!')



if __name__ == '__main__':
    
    '''
    Class calls
    '''
    url = secret.url
    username = secret.username
    password = secret.password

    browser_session = CourseraScraper(url, username, password)

    browser_session.login()

    browser_session.click_accomplishments()

    browser_session.get_pages()

    browser_session.close()


I've tried reworking the method based on suggestions from ChatGPT, but so far without success.

I want the method to advance to page 2 (and keep going if there are more pages) and append the scraped data to the .txt file.

1 Answer

-1

The basic approach to scraping multiple pages with Selenium is to call .click() on the "Next" button until that button is no longer clickable, using a loop with exception handling to break out once the button can't be clicked. This is also why your get_pages() stops early: find_elements() is evaluated once with the id for page 1, so it returns a one-element list, the for loop runs exactly once, and nothing is ever clicked, meaning the browser never leaves the first page. Here is an example of the next-button pattern:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

link = 'https://www.ebay.com/b/TV-Video-Home-Audio-Electronics/32852/bn_1648392?rt=nc&_pgn=1'
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(link)

WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']")))

while True:
    try:
        # Searching for current page
        pno = driver.find_element(By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']").get_attribute("textContent") 
        # Printing current page
        print("Page no: ", pno) 
        # Wait until the "Next" button is clickable, then scroll it into view
        next_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='pagination__next icon-link']")))
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        # If the "Next" button is present and clickable, go to the next page
        next_button.click()
    except (WebDriverException, TimeoutException) as err:
        print("End")
        break
driver.quit()
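
Applied to your Coursera scraper, the same pattern would replace get_pages() with a loop that scrapes the current page and then clicks the "Next" pagination control until it is gone. Here is a minimal sketch of such a replacement method for your CourseraScraper class; the CSS selector for the next button is a guess, so inspect the pagination controls on your accomplishments page and substitute the real one:

# Extra import alongside the ones already in your script:
from selenium.common.exceptions import ElementNotInteractableException

    def get_pages(self):
        '''
        Scrape the current page, then click "Next" until the button
        is missing or no longer clickable.
        '''
        page = 1
        while True:
            print(f'Starting on page {page}')
            self.selector_counter = 2   # card numbering restarts on each new page
            self.scrape_page()
            try:
                # Hypothetical selector - verify it in your browser's dev tools
                next_button = self.browser.find_element(
                    By.CSS_SELECTOR, 'button[aria-label="Next Page"]')
                next_button.click()
                time.sleep(5)           # crude wait for the next page to render
                page += 1
            except (NoSuchElementException, ElementNotInteractableException):
                print(f'Total pages scraped: {page}')
                print('Process complete!')
                break

If Coursera merely disables the button on the last page instead of removing it, check its "disabled" attribute before clicking; and if the cards load asynchronously, replace the time.sleep(5) with a WebDriverWait on the first accomplishment card.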
