如何在纽约时报餐饮网站上加速硒刮刀?

2024-06-10 10:16:11 发布

您现在位置:Python中文网/ 问答频道 /正文

目前,我正试图浏览《纽约时报》餐饮网站(nytimes.com/reviews/dinging),获取每家餐厅的链接和社区列表。不幸的是,我已经在GoogleColab上运行下面的代码块大约9个小时了,迭代器X已经运行了1175次。我正试图弄清楚到底发生了什么,但我太害怕了,不敢停下来重新开始。是嵌套for循环导致此过程花费如此长的时间吗

driver = webdriver.Chrome('chromedriver', chrome_options = chrome_options)
driver.get("https://www.nytimes.com/reviews/dining")

WebDriverWait(driver, 20).until(EC.element_to_be_clickable
                                ((By.XPATH,"//button[text()='Show More']"))).click()
url_list = []
nyt_dining = pd.DataFrame(columns = ['Restaurant', 'URL', 'servesCuisine', 'priceRange', 'addressLocality'])

x = 0 
while(True):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located
                                             ((By.CSS_SELECTOR,"div.css-102xbk1")))
    url_before = len(url_list)
    for e in elements:
      # 2.1 Getting the links
      link = e.find_element_by_css_selector("a.css-gg4vpm")
      link = link.get_attribute("href") ##

      # 2.1 Getting the restaurant name
      name = e.find_element_by_css_selector("h2.css-8aqwnr")
      name = name.text ##

      # 2.3 Getting other information
      info = e.find_element_by_css_selector("ul.css-o4kdzz")
      info = info.find_elements_by_tag_name('li')
      cuisine = ''
      price = ''
      neighborhood = ''
      for i in info:
        attribute = i.get_attribute("itemprop")
        if attribute == "servesCuisine":
          cuisine = i.text
        elif attribute == "priceRange":
          price = i.text
        elif attribute == "addressLocality":
          neighborhood = i.text

      # 2.4 Append to dataframe
      if link in url_list:
        continue
      else:
        url_list.append(link)
        nyt_dining = nyt_dining.append({'Restaurant': name, 'URL': link,
                                        'servesCuisine': cuisine,
                                        'priceRange': price,
                                        'addressLocality': neighborhood}, 
                                       ignore_index = True)
        print(x)
        x += 1
    url_after = len(url_list)
    if url_before >= url_after:
      break
    button = WebDriverWait(driver, 10).until(EC.visibility_of_element_located
                                             ((By.XPATH,"//button[text()='Show More']")))
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)

nyt_dining

Tags: textnameurlbydriverlinkattributebutton