Scraping all results from a page with BeautifulSoup

Posted 2024-04-20 09:14:47


**Update**

OK folks, so far so good. I now have code that downloads the images, but it stores them in a strange way: it downloads 40-odd pictures, then creates another "kittens" folder inside the "kittens" folder it already created and starts over, downloading the same images into it again. How can I fix that? Code below:

[updated code not preserved in the original post]
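The nested-folder symptom usually comes from combining `os.chdir('kittens')` with a later `os.makedirs('kittens')`: once the script has changed into the folder, the next `makedirs` call creates `kittens/kittens`. A minimal sketch of the usual fix (the helper name `download_path` is my own, not from the original code): create the folder once with `exist_ok=True` and build file paths with `os.path.join` instead of changing directory.

```python
import os

def download_path(filename, folder='kittens'):
    # Create the folder once; exist_ok avoids an error on re-runs and
    # removes the need for a separate os.path.exists check.
    os.makedirs(folder, exist_ok=True)
    # Join the filename onto the folder instead of os.chdir-ing into it,
    # so the path is always relative to the same starting directory and
    # a second pass can never create kittens/kittens.
    return os.path.join(folder, filename)

# usage: open(download_path('kitten-0.jpg'), 'wb')
```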

====================================================================================

I want to write a spider that scrapes pictures of kittens from a certain page. I've hit a small problem: my spider only gets the first 15 images. I suspect this is because the page loads more images as you scroll down. How can I solve this? Code below:

import requests
from bs4 import BeautifulSoup as bs
import os


url = 'https://www.pexels.com/search/cute%20kittens/'

page = requests.get(url)
soup = bs(page.text, 'html.parser')

image_tags = soup.find_all('img')  # find_all: findAll is the legacy alias

# exist_ok avoids an error (and a nested folder) on repeated runs
os.makedirs('kittens', exist_ok=True)

for x, image in enumerate(image_tags):
    try:
        source = requests.get(image['src'])
    except (KeyError, requests.RequestException):
        continue  # tag without a src attribute, or network failure
    if source.status_code == 200:
        # build the path explicitly instead of os.chdir('kittens')
        with open(os.path.join('kittens', 'kitten-{}.jpg'.format(x)), 'wb') as f:
            # reuse the response already fetched; the original called
            # requests.get(url) a second time, downloading each image twice
            f.write(source.content)
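One more thing worth checking before reaching for a browser tool: lazy-loading pages often serve the first batch of images with a real `src` and put the remaining URLs in an attribute such as `data-src` (the attribute name is site-specific and an assumption here; inspect the actual page markup to confirm). A sketch of collecting whichever attribute is present:

```python
from bs4 import BeautifulSoup

# toy markup standing in for a lazy-loading page: one eager image,
# one whose real URL sits in data-src behind a placeholder src
html = """
<img src="https://example.com/a.jpg">
<img data-src="https://example.com/b.jpg" src="placeholder.gif">
"""

soup = BeautifulSoup(html, 'html.parser')

urls = []
for img in soup.find_all('img'):
    # prefer the lazy-load attribute when present; fall back to src
    src = img.get('data-src') or img.get('src')
    if src and src.startswith('http'):
        urls.append(src)

print(urls)
```

This only helps if the URLs are already in the HTML; if the page fetches them via JavaScript after scrolling, you need browser automation as in the answer below.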

1 Answer

Answered 2024-04-20 09:14:47

Since the site is dynamic, you need a browser-automation tool such as selenium:

from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os

driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')

# scroll to the bottom until the page height stops growing,
# so all lazy-loaded images end up in the DOM
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)  # give the newly loaded images time to arrive
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img') if i.get('src')]

os.makedirs('kittens', exist_ok=True)
# open in write mode -- the original open('kittens.txt') with no mode
# would try to read a file that does not exist yet
with open(os.path.join('kittens', 'kittens.txt'), 'w') as f:
    for url in image_urls:
        f.write('{}\n'.format(url))
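The answer above only writes the collected URLs to a text file. To actually save the images, you can feed those URLs through a download loop like the one in the question; a sketch (the function name `download_images` and the error handling are my own):

```python
import os
import requests

def download_images(image_urls, folder='kittens'):
    os.makedirs(folder, exist_ok=True)
    saved = []
    for i, url in enumerate(image_urls):
        try:
            resp = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip URLs that fail instead of aborting the run
        if resp.status_code == 200:
            path = os.path.join(folder, 'kitten-{}.jpg'.format(i))
            with open(path, 'wb') as f:
                f.write(resp.content)  # write the response already fetched
            saved.append(path)
    return saved

# usage: download_images(image_urls) after the selenium scrape completes
```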
