Pagination scraping with Selenium and Python
I'm practicing web scraping and wrote a script that logs into my own Coursera account (I didn't see anything in their robots.txt forbidding this, and since it's my own information I figure it's fine). The main annoyance is that I still have to solve the CAPTCHA manually when it appears, but apart from that, scraping the text on the first page works. However, I have many pages of data to get through, and I'm now stuck on the pagination. Does anyone have advice? I think the problem is in my def get_pages(self): method. The print statements I added there show it reaches page 2, but it seems to finish before it actually runs...
My code is:
import time

import secret
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

'''
Code text
'''
template = 'Certificate Name: {}\nCredential ID: {}\n'


class CourseraScraper:
    '''
    Class variables
    '''
    def __init__(self, url: str, username: str, password: str):
        # options = webdriver.FirefoxOptions()
        # options.add_argument('--headless')
        self.url = url
        self.username = username
        self.password = password
        self.browser = webdriver.Firefox()
        self.page_counter = 1
        self.selector_counter = 2

    def login(self):
        '''
        Using login info saved in secret.py, log in using CSS_SELECTOR
        '''
        self.browser.get(self.url)
        print('Accessing browser URL...')
        username_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#email')
        username_css_selector.send_keys(self.username)
        print('Entering username...')
        pwd_css_selector = self.browser.find_element(By.CSS_SELECTOR, '#password')
        pwd_css_selector.send_keys(self.password)
        print('Entering password...')
        login_button = self.browser.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
        login_button.click()
        print('Clicking login button...')
        time.sleep(60)
        print('Time delay for CAPTCHA...')

    def click_accomplishments(self):
        '''
        Once logged in, pause (there is a box element that sometimes obscures the
        drop-down, so the pause should allow the element to become visible), then click
        "Accomplishments" from the menu.
        '''
        print('Finding "Accomplishments" from drop-down...')
        select_dropdown = self.browser.find_element(By.CSS_SELECTOR, '.cds-Avatar-initial')
        select_dropdown.click()
        print('Clicking link...')
        accomplishments_link = self.browser.find_element(By.CSS_SELECTOR, 'li.dropdown-btn:nth-child(6) > a:nth-child(1)')
        accomplishments_link.click()
        time.sleep(5)

    def get_pages(self):
        '''
        This is the problem area - trying to select a page, then scrape - but it does not proceed to the 2nd page.
        '''
        n = self.page_counter
        pages = self.browser.find_elements(By.XPATH, f'//*[@id="pagination_number_box_{n}"]')
        for page in pages:
            print(f'Starting on page {n}')
            self.scrape_page()
            n = self.counter_increment()
        print(f'Total pages scraped: {n}')
        print('Process complete!')

    def counter_increment(self):
        '''
        The page count is initialized to 1, then increment n to the next page(s) number
        '''
        self.page_counter += 1
        return self.page_counter

    def selector_increment(self):
        '''
        Box selector starts at n = 2, then increment for each one encountered
        '''
        self.selector_counter += 1
        return self.selector_counter

    def scrape_page(self):
        '''
        Scrape specific elements using CSS_SELECTOR and pass the text to write_text_to_file() to be written to a .txt file.
        '''
        n = self.selector_counter
        while True:
            tag_css_selector = f'div.rc-AccomplishmentCard:nth-child({n}) > div:nth-child(1) > div:nth-child(2) > a:nth-child(1)'
            try:
                accomplishment = self.browser.find_element(By.CSS_SELECTOR, tag_css_selector)
                h3_text = accomplishment.find_element(By.TAG_NAME, 'h3').text
                to_field = accomplishment.get_attribute('to')
                to_field_text_split = to_field.rsplit('/', 1)[-1]
                formatted_text = template.format(h3_text, to_field_text_split)
                print(formatted_text)
                self.write_text_to_file(formatted_text)
                n = self.selector_increment()
            except NoSuchElementException:
                print('Reached end of elements.')
                break
            except Exception as e:
                print(f'An error occurred: {e}')
                break

    def write_text_to_file(self, write_text: str):
        '''
        Write values to file
        '''
        with open('div_elements.txt', 'a+') as f:
            f.write(write_text + '\n')

    def close(self):
        '''
        Close browser session.
        '''
        self.browser.close()
        print('All done!')


if __name__ == '__main__':
    '''
    Class calls
    '''
    url = secret.url
    username = secret.username
    password = secret.password

    browser_session = CourseraScraper(url, username, password)
    browser_session.login()
    browser_session.click_accomplishments()
    browser_session.get_pages()
    browser_session.close()
I've tried rearranging the method based on suggestions from ChatGPT, but so far without success.
I expect the method to move on to page 2 (and keep going if there are more pages) and append the scraped data to the .txt file.
1 Answer
The basic idea when scraping multiple pages with Selenium is to keep calling .click() on the "next page" button until that button is no longer clickable. You can combine a loop with exception handling: when the "next" button can't be clicked any more, break out of the loop. Here is an example:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

link = 'https://www.ebay.com/b/TV-Video-Home-Audio-Electronics/32852/bn_1648392?rt=nc&_pgn=1'

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(link)
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']")))

while True:
    try:
        # Searching for the current page number
        pno = driver.find_element(By.XPATH, "//ol[@class='pagination__items']/li/a[@aria-current='page']").get_attribute("textContent")
        # Printing the current page
        print("Page no: ", pno)
        # Check that the next-page button is not disabled, scrolling it into view
        driver.execute_script(
            "return arguments[0].scrollIntoView(true);",
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='pagination__next icon-link']")))
        )
        # If the next page is not disabled, go to it
        driver.find_element(By.XPATH, "//a[@class='pagination__next icon-link']").click()
    except (WebDriverException, TimeoutException):
        print("End")
        break

driver.quit()
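Applied to the Coursera scraper in your question, the same pattern could replace get_pages() with a loop that scrapes, then tries to click a "next" control. This is only a sketch: the //button[@aria-label="Next Page"] locator is a guess, so inspect the accomplishments page and substitute the real one. Note also that scrape_page() starts counting cards at self.selector_counter and never resets it, so after page 1 it would immediately hit NoSuchElementException on the next page; resetting the counter on each iteration avoids that.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

def get_pages(self):
    '''
    Sketch of a replacement method: scrape the current page, then click
    "next" until the button can no longer be found or clicked.
    '''
    while True:
        print(f'Starting on page {self.page_counter}')
        self.selector_counter = 2  # reset the card counter for each new page
        self.scrape_page()
        try:
            # Hypothetical locator - replace with the page's real "next" control
            next_button = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Next Page"]'))
            )
            next_button.click()
            self.counter_increment()
        except (TimeoutException, WebDriverException):
            # No clickable "next" button: assume this was the last page
            print(f'Total pages scraped: {self.page_counter}')
            print('Process complete!')
            break

Catching TimeoutException here mirrors the eBay example above: the wait timing out is the signal that the last page has been reached.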