多进程进程池挂起连接

2024-04-25 20:34:53 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在写一个scraper来快速从google图片下载图片。 为了确保这个操作在处理大量图像时足够快,我使用了Python的multiprocessing包。你知道吗

URL列表的每个成员都被传递给下载图像的函数。所有这些都是在使用apply_async的进程Pool中完成的。 但是,例如,当下载200个图像时,池将永远挂起在~197/200处,并且永远不会joins

这是第行的代码问题。我尽量少写一个例子。滚动功能是必要的获得>;100图像下载,否则脚本工作正常。你知道吗

from selenium import webdriver
from six.moves import urllib
from multiprocessing import Pool
import tqdm
import time
import json
import sys
import os

# Config
download_img_path = "test/"
req_header = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"


def write_img_file(img_item):
    req = urllib.request.Request(img_item)
    req.add_header('User-Agent', req_header)
    urllib.request.urlopen(req)
    return


def get_images(driver, folder_path, num):
    images = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
    img_list = [json.loads(images[i].get_attribute('innerHTML'))["ou"] for i in range(min(len(images), num))]

    pbar = tqdm.tqdm(total=len(img_list))
    def update(*a):
        pbar.update()

    pool = Pool()
    for i in range(pbar.total):
        pool.apply_async(write_img_file, args=(img_list[i], ), callback=update)
    pool.close()
    pool.join()
    del pool

def scroll(driver, num_scrolls):

    for _ in range(num_scrolls):
        for __ in range(10):
            # scrolls to show all 400 images
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(0.2)
        # click "show more results"
        time.sleep(0.5)
        try:
            driver.find_element_by_xpath(
                "//input[@value='Plus de résultats']") .click()
            time.sleep(0.5)
        except Exception as e:
            print("    show more results failed -> exception: " + str(e))


def search(search_txt, num):

    if not os.path.exists(download_img_path):
        os.makedirs(download_img_path)

    url = "https://www.google.co.in/search?q=" + \
        search_txt + "&source=lnms&tbm=isch"
    driver = webdriver.Chrome(
        executable_path=r"/usr/lib/chromium/chromedriver")
    driver.get(url)
    num_scrolls = int(num / 400 + 1)
    scroll(driver, num_scrolls)

    get_images(driver, download_img_path, num)


search("hotdog", 200)

print('DONE')
sys.exit()


Tags: pathin图像importimgsearchtimedownload