我正在使用Python套接字从一个网站下载图像。我对使用请求或urllib库不感兴趣。我想用线程加速这个过程。我用过多处理.dummy我以前在图书馆工作过,它通常对我有用。在这种情况下,它是非常不可预测的。当它应该下载53张图片(见下面的响应头)时,它通常会下载38到44张图片。我已经统计了响应头消息的数量,其中有53条。这似乎表明我应该收到53张照片。然而,我的代码每次保存的图像不到53个。有人能认出我在哪里用线弄错了吗?我下载了所有的图像没有利用线程,它工作得很好。这使我相信这是我的线程实现的一个问题。下面是我的脚本的一个运行示例。你知道吗
PS D:\Documents\School\RIT\Classes\Summer 2018\CSEC 380\Homework\3\Script> python .\hw3-script.py
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAcharya.jpg.pagespeed.ic.dQLJ0KfusA.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xAbuaitah.jpg.pagespeed.ic.PFwk87Pcno.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/xCSEC.png.pagespeed.ic.Ep0KUkS94M.png HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
MESSAGE SENT
GET /gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files//xJake,P20Brown.jpg.pagespeed.ic.KvGLjuuU03.jpg HTTP/1.0
Host: www.rit.edu
Accept: image/jpg, image/png
<...many more MESSAGE SENT and RESPONSE HEADERS...>
RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/CSEC.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 12:51:06 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 12:51:06 GMT
X-Original-Content-Length: 13647
Content-Length: 10131
Connection: close
Content-Type: image/png
RESPONSE HEADERS
HTTP/1.1 200 OK
Date: Sun, 12 Aug 2018 15:19:44 GMT
Server: Apache
Link: <http://www.rit.edu/gccis/computingsecurity/sites/rit.edu.gccis.computingsecurity/files/images/footer-logo.png>; rel="canonical"
X-Content-Type-Options: nosniff
Accept-Ranges: bytes
Expires: Mon, 12 Aug 2019 14:09:08 GMT
Cache-Control: max-age=31536000
Etag: W/"0"
Last-Modified: Sun, 12 Aug 2018 14:09:08 GMT
X-Original-Content-Length: 19921
Content-Length: 16125
Connection: close
Content-Type: image/png
Number of image urls: 53
Number of files downloaded: 42
Time elapsed: 0:00:04.019662
我的代码:
import sys
import socket
import re
import os
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
from bs4 import BeautifulSoup
class MySocket:
def __init__(self, sock=None):
if sock is None:
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
else:
self.sock = sock
def connect(self, host, port):
self.sock.connect((host, port))
def myclose(self):
self.sock.close()
def mysend(self, msg, debug=False):
if debug:
print("MESSAGE SENT")
print(msg.decode())
self.sock.sendall(msg)
def myreceive(self, debug=False):
received = b''
buffer = 1
while True:
part = self.sock.recv(buffer)
received += part
if part == b'':
break
if debug:
print("Received...")
print(received)
return received
def get_image_urls(html):
"""
Gets all urls corresponding to images in given html data
:param data: html page to parse
:return: list of image urls
"""
soup = BeautifulSoup(html, "html.parser")
img_urls = []
for image in soup.find_all('img'):
string = str(image)
split = string.split("src=\"")
if split[1][0] == '/':
find_url = split[1].split("\"")
url = find_url[0]
img_urls.append(url)
# img_urls.append("www.rit.edu" + url)
return img_urls
def download_image(img_url):
"""
Download images with the given socket and list of urls
:param img_url: url corresponding to an image
:return: None
"""
image_socket = MySocket()
image_socket.connect("www.rit.edu", 80)
message = "GET " + img_url + " HTTP/1.0\r\n" \
"Host: www.rit.edu\r\n" \
"Accept: image/jpg, image/png\r\n\r\n"
image_socket.mysend(message.encode(), debug=True)
reply = image_socket.myreceive()
headers = reply.split(b'\r\n\r\n')[0]
print("RESPONSE HEADERS")
print(headers.decode())
print()
image = reply[len(headers)+4:]
img_name = str(len(os.listdir(".\\act1step2images"))) + img_url[-4:]
f = open(os.path.join(".\\act1step2images", img_name), 'wb')
f.write(image)
f.close()
def download_images(image_urls, directory):
if not os.path.exists(directory):
os.mkdir(directory)
pool = ThreadPool(100)
pool.map(download_image, image_urls)
pool.close()
pool.join()
def main():
start_time = datetime.now()
host = "www.rit.edu"
port = 80
message = "GET /gccis/computingsecurity/people HTTP/1.0\r\n" \
"Host: www.rit.edu\r\n" \
"Accept: */*\r\n\r\n"
part2_socket = MySocket()
part2_socket.connect(host, port)
part2_socket.mysend(message.encode())
html = part2_socket.myreceive().decode()
part2_socket.myclose()
image_urls = get_image_urls(html)
download_images(image_urls, ".\\act1step2images")
print("Number of image urls:", len(image_urls))
print("Number of files downloaded:", str(len(os.listdir(".\\act1step2images"))))
print("Time elapsed:", datetime.now() - start_time)
main()
目前没有回答
相关问题 更多 >
编程相关推荐