How can I get all the download links from Unsplash using Selenium?

Posted 2024-04-28 20:23:01


I'm trying to download a collection of images from Unsplash.

When I check len(links), I only get 29, but it should be 63.

I'm not sure what the problem is:

from selenium import webdriver


def driver_download(location_for_download):
    # options = Options()
    # options.headless = True
    chrome_options = webdriver.ChromeOptions()
    # Send downloads to the given directory instead of the default ~/Downloads
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # driver = webdriver.Chrome(chrome_options=chrome_options)
    driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver", chrome_options=chrome_options)
    return driver

url = 'https://unsplash.com/collections/10927848/thestockmarketinvestor'

driver = driver_download('/home/xxx/Documents/xxxxx/pictures_from_unsplash/')

# I have clicked Load more images; all the images are showing on the page.

driver.get(url)

x = driver.find_elements_by_tag_name('a')

count = 0

for i in x:
    if i.get_attribute('title') == 'Download photo':
        count += 1

I tried scrolling to the bottom of the page and to the middle. The result is still the same.
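For reference, a common pattern for lazy-loaded pages like this is to keep scrolling and wait until the page height stops growing before collecting the anchors. This is only a minimal sketch (the helper name and timings are my own, and the page may additionally require clicking a Load more button, as the answer below shows):

import time

def scroll_until_stable(driver, pause=2, max_rounds=20):
    # Scroll to the bottom repeatedly; stop once the page height no longer grows
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # crude fixed wait; WebDriverWait would be more robust
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height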


2 Answers

So I did some more work, and below is the working script.

It is not the best approach.

There is one step that still requires the user to click. Can that be automated? (One possible way is sketched after the script.)

import os
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def driver_download(location_for_download):
    # options = Options()
    # options.headless = True
    chrome_options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': location_for_download}
    chrome_options.add_experimental_option('prefs', prefs)
    # driver = webdriver.Chrome(chrome_options=chrome_options)
    driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver", chrome_options=chrome_options)
    return driver


def get_picture_links(url, location):
    # Create the download directory if it does not already exist
    if not os.path.isdir(location):
        os.mkdir(location)
    driver = driver_download(location)
    driver.maximize_window()

    driver.get(url)

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")
    count = 0
    links = []
    for _ in range(7):
        if count == 0:
            time.sleep(4)
            anchors = driver.find_elements_by_tag_name('a')
            for a in anchors:
                if a.get_attribute('title') == 'Download photo':
                    links.append(a.get_attribute('href'))
            count += 1
        else:
            if count == 1:
                # Wait for the user to click the 'Load more photos' button
                time.sleep(4)
                input('Please click Load More Photos')
            body = driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(5)
            anchors = driver.find_elements_by_tag_name('a')
            for a in anchors:
                if a.get_attribute('title') == 'Download photo':
                    links.append(a.get_attribute('href'))
            count += 1
    links = list(set(links))
    print('Found: %s Pictures to Download.' % str(len(links)))
    driver.quit()
    return links


def get_pictures(links, location):
    print('Downloading....{} files, it should take around {} seconds'.format(len(links), len(links) * 4))
    driver = driver_download(location)
    for link in links:
        time.sleep(4)
        driver.get(link)
    time.sleep(20)
    driver.quit()
    print('Pictures have been downloaded..Renaming now')


def rename_pictures(location):
    # Rename the files
    os.chdir(location)
    files = os.listdir()
    files = [i for i in files if i.endswith(('.jpg', '.jpeg'))]

    count = 1

    for i in files:
        os.rename(i, str(count) + '.jpg')
        count += 1
    print('Everything done! Open the folder to see the files')

location = 'Blah'
url = 'https://unsplash.com/xxxx/xxxx' # Change to the required url
links = get_picture_links(url=url, location=location)
# Download the files
get_pictures(links=links, location=location)
# Rename the files
rename_pictures(location=location)
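To automate the input('Please click Load More Photos') step, one option is to let Selenium find and click the button itself. This is only a sketch, assuming the button can be located by its visible text 'Load more photos' (check the live page; the text or markup may differ):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_load_more(driver, timeout=10):
    # Wait until the assumed 'Load more photos' button is clickable, then click it
    try:
        button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(., 'Load more photos')]")
            )
        )
        button.click()
        return True
    except Exception:
        return False  # button absent or not clickable within the timeout

If this works on the live page, the input(...) line above can be replaced with a call to click_load_more(driver).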

The site uses GET requests to fetch JSON data for each batch of 10 images. I'm not familiar with Python, but I'll give you an R script that you can translate into Python (a rough Python sketch follows the R code below). For this site you don't need Selenium, though.

library(rvest)
library(stringr)
library(rjson)

all_links <- character()
for (i in 1:7) {
  # Fetch one page of 10 photos from the collection's JSON endpoint
  url = str_c("https://unsplash.com/napi/collections/10927848/photos?page=", i, "&per_page=10&order_by=latest")
  pg <- fromJSON(file = url)
  links <- character()
  for (j in 1:length(pg)) links[j] <- pg[[j]]$links$download[1]
  
  all_links <- c(all_links, links)
}

Basically, the idea is to fetch the JSON; the download link for each item sits at its $links$download node.
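A rough Python translation of the R script above (a sketch; the napi endpoint is Unsplash's internal API, so its parameters and availability are not guaranteed):

import requests

all_links = []
for page in range(1, 8):  # 7 pages of 10 photos, as in the R loop
    url = ("https://unsplash.com/napi/collections/10927848/photos"
           "?page={}&per_page=10&order_by=latest".format(page))
    photos = requests.get(url, timeout=10).json()
    for photo in photos:
        # the download link sits at links -> download in each item
        all_links.append(photo["links"]["download"])

print("Found: %s Pictures to Download." % len(all_links))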
