如何在易趣中迭代页面

class EbayScraper(object):
    """Fetch one page of eBay search results and print every listing on it.

    The search URL is assembled from ``base_url``, the search term, the
    listing-type filter and the page number; ``getPageUrl`` downloads and
    parses that page and ``getInfo`` prints the listings found in it.
    """

    def __init__(self, item, buying_type):
        """Store the search term and listing type.

        Args:
            item: Search term as typed by the user (may contain spaces).
            buying_type: Listing type, e.g. ``"Buy It Now"`` or ``"Auction"``;
                stored with ``"=1"`` appended so it can be used as a URL flag.
        """
        self.base_url = "https://www.ebay.com/sch/i.html?_nkw="
        # The Chrome driver is created lazily by the ``driver`` property.
        # Nothing in this class uses it, and starting it here launched a
        # browser process that was never quit.
        self._driver = None
        self.item = item
        self.buying_type = buying_type + "=1"
        self.url_seperator = "&_sop=12&rt=nc&LH_"
        self.url_seperator2 = "&_pgn="
        self.page_num = "1"

    @property
    def driver(self):
        """Return the Selenium Chrome driver, starting it on first access."""
        if self._driver is None:
            self._driver = webdriver.Chrome(r"chromedriver.exe")
        return self._driver

    @staticmethod
    def _parse_price(text):
        """Return ``text`` as a float, or ``None`` if it is not a single amount.

        The first character is treated as the currency symbol and dropped,
        and thousands separators are removed, so ``"$1,234.50"`` parses.
        Text such as ``"$10.00 to $20.00"`` yields ``None``.
        """
        try:
            return float(text.strip()[1:].replace(",", ""))
        except ValueError:
            return None

    @staticmethod
    def _text_of(tag):
        """Return the text of ``tag``, or ``"N/A"`` when the element is missing."""
        return tag.text if tag is not None else "N/A"

    def getPageUrl(self):
        """Download the current results page and return it parsed.

        Side effects: ``self.buying_type`` is rewritten from
        ``"Buy It Now=1"`` to ``"BIN=1"`` and spaces in ``self.item`` are
        replaced by ``+``. ``getInfo`` compares against ``"BIN=1"``, so it
        relies on this rewrite having happened.

        Returns:
            The ``BeautifulSoup`` document of the fetched page.
        """
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        self.item = self.item.replace(" ", "+")
        url = (
            self.base_url + self.item + self.url_seperator
            + self.buying_type + self.url_seperator2 + self.page_num
        )
        response = requests.get(url)
        return BeautifulSoup(response.text, 'html.parser')

    def getInfo(self, soup):
        """Print title, condition, price, time left and link of each listing.

        Listings without an ``s-item__link`` anchor are skipped. Any other
        missing element is printed as ``"N/A"`` instead of raising
        ``IndexError``, and a price that is not a single amount is printed
        as its raw text instead of raising ``ValueError``.

        Args:
            soup: Parsed results page as returned by ``getPageUrl``.
        """
        for listing in soup.find_all("li", {"class": "s-item"}):
            link_tag = listing.find("a", {"class": "s-item__link"})
            if link_tag is None:
                continue

            title = self._text_of(
                listing.find("h3", {"class": "s-item__title"}))
            condition = self._text_of(
                listing.find("span", {"class": "SECONDARY_INFO"}))
            price_text = self._text_of(
                listing.find("span", {"class": "s-item__price"}))
            price = self._parse_price(price_text)

            print(title)
            print(condition)
            print(price_text if price is None else price)
            if self.buying_type != "BIN=1":
                time_tag = listing.find("span", {"class": "s-item__time-left"})
                # The last four characters are dropped, as in the original
                # (presumably the trailing "left" of e.g. "2d 3h left").
                print(time_tag.text[:-4] if time_tag is not None else "N/A")
            print(link_tag['href'])
            print('\n')


if __name__ == '__main__':
    item = input("Item: ")
    buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
    instance = EbayScraper(item, buying_type)
    page = instance.getPageUrl()
    instance.getInfo(page)

1条回答

网友

1楼 · 发布于 2024-04-25 08:18:19

如果要遍历所有页面并收集全部结果，脚本需要在访问每个页面之后检查是否还存在下一页（next）。

import requests
from bs4 import BeautifulSoup


class EbayScraper(object):
    """Walk every page of an eBay search and pass each page to ``getInfo``.

    ``__init__`` and ``getInfo`` are elided (``...``) in this snippet. The
    attributes read by ``get_url`` (``base_url``, ``item``, ``buying_type``,
    ``url_seperator``, ``url_seperator2``) must be set on the instance
    before any page is requested.
    """

    def __init__(self, item, buying_type):
        ...
        # Number of the results page being processed; starts at page 1.
        self.currentPage = 1

    def get_url(self, page=1):
        """Return the search URL for results page ``page``.

        Side effects: ``self.buying_type`` is rewritten from
        ``"Buy It Now=1"`` to ``"BIN=1"`` and spaces in ``self.item`` are
        replaced by ``+``. Both rewrites leave an already-rewritten value
        unchanged, so calling this once per page is harmless.
        """
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"

        self.item = self.item.replace(" ", "+")
        # _ipg=200 asks for 200 items per page
        return '{}{}{}{}{}{}&_ipg=200'.format(
            self.base_url, self.item, self.url_seperator, self.buying_type,
            self.url_seperator2, page
        )

    def page_has_next(self, soup):
        """Return True when the pagination list has an item after the selected one.

        Returns False, rather than raising ``AttributeError``, when the page
        has no pagination list or no selected item (presumably the case when
        all results fit on one page).
        """
        container = soup.find('ol', 'x-pagination__ol')
        if container is None:
            return False

        selected = container.find('li', 'x-pagination__li selected')
        if selected is None:
            return False

        # find_next_sibling('li') skips whitespace text nodes between tags.
        # Plain .next_sibling can return such a node, which would make the
        # last page look as if it had a successor.
        return selected.find_next_sibling('li') is not None

    def iterate_page(self):
        """Fetch and process result pages until ``page_has_next`` is false.

        ``self.currentPage`` is advanced by one after each page that has a
        successor, so it ends on the number of the last page processed.
        """
        while True:
            # Use self, not the module-level ``instance``, so the method
            # works on any EbayScraper object.
            page = self.getPageUrl(self.currentPage)
            self.getInfo(page)
            if not self.page_has_next(page):
                break
            self.currentPage += 1

    def getPageUrl(self, pageNum):
        """Download results page ``pageNum`` and return it parsed.

        The requested URL is printed before the request is made.
        """
        url = self.get_url(pageNum)
        print('page: ', url)
        response = requests.get(url)
        return BeautifulSoup(response.text, 'html.parser')

    def getInfo(self, soup):
        # Body elided in this snippet; it receives the parsed page that
        # getPageUrl returned.
        ...


if __name__ == '__main__':
    # Ask for the search term and the listing type; both are passed to
    # EbayScraper exactly as typed.
    item = input("Item: ")
    buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")

    # Build the scraper and let iterate_page() process the result pages.
    instance = EbayScraper(item, buying_type)
    instance.iterate_page()

这里的重要函数是page_has_next和iterate_page

page_has_next：检查分页栏中当前选中（selected）的页码旁边是否还有另一个 li 元素。例如分页栏为 < 1 2 3 >，当前位于第 1 页时，它会检查后面是否还有第 2 页，以此判断是否存在下一页。
iterate_page：循环抓取页面，直到 page_has_next 判断没有下一页为止

另外请注意，除非您需要模拟用户点击或需要浏览器导航，否则不需要selenium

相关问题更多 >

编程相关推荐

热门问题

热门文章