I'm scraping this website: https://www.dccomics.com/comics
If you keep scrolling down, you will find a "browse comics"
section with pagination.
I want to scrape all 25 comics from each of pages 1 through 5.
This is the code I have so far:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time


class Scraper():
    comics_url = "https://www.dccomics.com/comics"
    driver = webdriver.Chrome("C:\\laragon\\www\\Proftaak\\chromedriver.exe")
    # driver = webdriver.Chrome("C:\\laragon\\www\\proftaak-2020\\Proftaak-scraper\\chromedriver.exe")
    driver.get(comics_url)
    driver.implicitly_wait(500)
    current_page = 2

    def GoToComic(self):
        # Open each of the 25 comics on the current results page in turn.
        for i in range(1, 26):
            time.sleep(2)
            goToComic = self.driver.find_element_by_xpath(
                f'//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[2]/div[{i}]/a/img')
            self.driver.execute_script("arguments[0].click();", goToComic)
            self.ScrapeComic()
            self.driver.back()
            self.ClearFilter()
            # After the last comic on a page, move to the next page (stop after page 5).
            if self.current_page != 6:
                if i == 25:
                    self.current_page += 1
                    self.ToNextPage()

    def ScrapeComic(self):
        # Collect the fields from the comic's detail page.
        self.driver.implicitly_wait(250)
        title = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'page-title')]")))]
        price = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'buy-container-price')]/span[contains(@class, 'price')]")))]
        available = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, "//div[contains(@class, 'sale-status-container')]/span[contains(@class, 'sale-status')]")))]
        try:
            description = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
                EC.visibility_of_all_elements_located((By.CLASS_NAME, "field-items")))]
        except:
            # Some comics have no description; skip them.
            return

    def ToNextPage(self):
        # Click the pagination link for the next page (pages 1-5 only).
        if self.current_page != 6:
            nextPage = self.driver.find_element_by_xpath(
                f'//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[3]/div[1]/ul/li[{self.current_page}]/a')
            self.driver.execute_script("arguments[0].click();", nextPage)
            self.GoToComic()

    def AcceptCookies(self):
        # Dismiss the cookie banner so it can't intercept clicks.
        self.driver.implicitly_wait(250)
        cookies = self.driver.find_element_by_xpath(
            '/html/body/div[1]/div[2]/div[4]/div[2]/div/button')
        self.driver.execute_script("arguments[0].click();", cookies)
        self.driver.implicitly_wait(100)

    def ClearFilter(self):
        # Reset any active browse filters.
        self.driver.implicitly_wait(500)
        clear_filter = self.driver.find_element_by_class_name('clear-all-action')
        self.driver.execute_script("arguments[0].click();", clear_filter)

    def QuitDriver(self):
        self.driver.quit()


scraper = Scraper()
scraper.AcceptCookies()
scraper.ClearFilter()
scraper.GoToComic()
scraper.QuitDriver()
Right now it scrapes the first 25 comics of the first page just fine, but the problem appears when I move to the second page: it scrapes the first comic of page 2 fine, but when I go back from that comic to the list, the filter resets and it starts from page 1 again.
How can I make it resume from the correct page, or keep the filter off before returning to the comics page? I have tried things like sessions/cookies, but it seems the filter is not persisted in any way.
The browse comics section of the page https://www.dccomics.com/comics doesn't have 5 pages of pagination, only 3. To scrape the name of each comic using Selenium and Python, you have to induce WebDriverWait for visibility_of_all_elements_located(), and you can use an xpath-based locator strategy, e.g. as in the sketch below.
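A minimal sketch of that approach, assuming the comic titles can be located through a result-title class inside the dcbrowseapp container (that locator is an assumption about the page markup, not confirmed here):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome("C:\\laragon\\www\\Proftaak\\chromedriver.exe")
driver.get("https://www.dccomics.com/comics")

# Wait until every title element on the current results page is visible,
# then collect the text of each one. The class name below is an assumed
# locator for the browse results, not confirmed against the live page.
titles = [elem.text for elem in WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located(
        (By.XPATH, "//*[@id='dcbrowseapp']//p[contains(@class, 'result-title')]")))]
print(titles)

driver.quit()

Running it prints the list of title strings for the currently loaded results page.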
The browser's back function takes you to the previously visited URL. On the site you mention, all result pages live at a single URL (the comics appear to be loaded into the same page by JS, so a new page of results doesn't need a new URL). That is why, when you go back from the first comic of the second page, you simply reload https://www.dccomics.com/comics, which loads the first page by default. I can also see that there is no dedicated control for going back from a comic's detail page to the list.
So the only way is to store the number of the current page somewhere in your code and, after returning from the comic detail page, switch back to that specific page, e.g. as sketched below.
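A minimal sketch of that fix against the class from the question: it reuses the pagination xpath from ToNextPage and assumes that, while page N is being scraped, current_page holds that page's index in the pagination list (which matches how the question's code increments it); the method name GoBackToList is hypothetical:

    def GoBackToList(self):
        # Hypothetical helper: go back from a comic detail page, then
        # restore the stored results page. driver.back() always reloads
        # https://www.dccomics.com/comics with page 1 as the default.
        self.driver.back()
        self.ClearFilter()
        # Page 1 is the default, so only later pages need re-selecting.
        if self.current_page > 2:
            page_link = self.driver.find_element_by_xpath(
                f'//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[3]'
                f'/div[1]/ul/li[{self.current_page}]/a')
            self.driver.execute_script("arguments[0].click();", page_link)

GoToComic would then call self.GoBackToList() in place of the self.driver.back() / self.ClearFilter() pair, so each iteration resumes on the page it left.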