Python多处理.dummy套接字下载图像

2024-05-14 00:40:35 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在使用Python套接字从一个网站下载图像。我对使用请求或urllib库不感兴趣。我想用线程加速这个过程。我用过多处理.dummy我以前在图书馆工作过,它通常对我有用。在这种情况下,它是非常不可预测的。当它应该下载53张图片(见下面的响应头)时,它通常会下载38到44张图片。我已经统计了响应头消息的数量,其中有53条。这似乎表明我应该收到53张照片。然而,我的代码每次保存的图像不到53个。有人能认出我在哪里用线弄错了吗?我下载了所有的图像没有利用线程,它工作得很好。这使我相信这是我的线程实现的一个问题。下面是我的脚本的一个运行示例。你知道吗

PS D:\Documents\School\RIT\Classes\Summer 2018\CSEC 380\Homework\3\Script> python .\hw3-script.py
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAcharya.jpg.pagespeed.ic.dQLJ0KfusA.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png


MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAbuaitah.jpg.pagespeed.ic.PFwk87Pcno.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png


MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/xCSEC.png.pagespeed.ic.Ep0KUkS94M.png HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png


MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xJake,P20Brown.jpg.pagespeed.ic.KvGLjuuU03.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png

<...many more MESSAGE SENT and RESPONSE HEADERS...>

RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/CSEC.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 12:51:06 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 12:51:06 GMT
X-Original-Content-Length: 13647
Content-Length: 10131
Connection: close
Content-Type: image/png

RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/footer-logo.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 14:09:08 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 14:09:08 GMT
X-Original-Content-Length: 19921
Content-Length: 16125
Connection: close
Content-Type: image/png

Number of image urls: 53
Number of files downloaded: 42
Time elapsed: 0:00:04.019662

我的代码:

import sys
import socket
import re
import os
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime


from bs4 import BeautifulSoup


class MySocket:

    def __init__(self, sock=None):
        if sock is None:
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        else:
            self.sock = sock

    def connect(self, host, port):
        self.sock.connect((host, port))

    def myclose(self):
        self.sock.close()

    def mysend(self, msg, debug=False):
        if debug:
            print("MESSAGE SENT")
            print(msg.decode())
        self.sock.sendall(msg)

    def myreceive(self, debug=False):
        received = b''
        buffer = 1
        while True:
            part = self.sock.recv(buffer)
            received += part
            if part == b'':
                break
        if debug:
            print("Received...")
            print(received)
        return received


def get_image_urls(html):
    """
    Gets all urls corresponding to images in given html data
    :param data: html page to parse
    :return: list of image urls
    """
    soup = BeautifulSoup(html, "html.parser")
    img_urls = []
    for image in soup.find_all('img'):
        string = str(image)
        split = string.split("src=\"")
        if split[1][0] == '/':
            find_url = split[1].split("\"")
            url = find_url[0]
            img_urls.append(url)
            # img_urls.append("www.rit.edu" + url)
    return img_urls


def download_image(img_url):
    """
    Download images with the given socket and list of urls
    :param img_url: url corresponding to an image
    :return: None
    """
    image_socket = MySocket()
    image_socket.connect("www.rit.edu", 80)
    message = "GET " + img_url + " HTTP/1.0\r\n" \
              "Host: www.rit.edu\r\n" \
              "Accept: image/jpg, image/png\r\n\r\n"

    image_socket.mysend(message.encode(), debug=True)
    reply = image_socket.myreceive()
    headers = reply.split(b'\r\n\r\n')[0]

    print("RESPONSE HEADERS")
    print(headers.decode())
    print()

    image = reply[len(headers)+4:]
    img_name = str(len(os.listdir(".\\act1step2images"))) + img_url[-4:]
    f = open(os.path.join(".\\act1step2images", img_name), 'wb')
    f.write(image)
    f.close()


def download_images(image_urls, directory):
    if not os.path.exists(directory):
        os.mkdir(directory)

    pool = ThreadPool(100)
    pool.map(download_image, image_urls)
    pool.close()
    pool.join()


def main():
    start_time = datetime.now()

    host = "www.rit.edu"
    port = 80
    message = "GET /gccis/computingsecurity/people HTTP/1.0\r\n" \
              "Host: www.rit.edu\r\n" \
              "Accept: */*\r\n\r\n"
    part2_socket = MySocket()
    part2_socket.connect(host, port)
    part2_socket.mysend(message.encode())
    html = part2_socket.myreceive().decode()
    part2_socket.myclose()
    image_urls = get_image_urls(html)
    download_images(image_urls, ".\\act1step2images")
    print("Number of image urls:", len(image_urls))
    print("Number of files downloaded:", str(len(os.listdir(".\\act1step2images"))))

    print("Time elapsed:", datetime.now() - start_time)

main()

Tags: imageselfurlimgpngdefwwwsocket