I am developing a web crawler, but I'm having a lot of trouble reading the certificate from the server I connect to. All I need is to read the certificate and print it to the screen, but with the getpeercert() function the certificate is always {}. I tried using get_server_certificate() instead, and it raises an error telling me there are too many values to unpack. I know there are also some bugs in the threading code, but I'll fix those later. Thanks in advance.
The problem is in the "acesso" function:
#coding: utf8
import socket
import sys
import re
import ssl
import pprint
from threading import Thread
from urlparse import urlparse

threads = []
vetorLinks = []
linksVisitados = []
nThreads = 4  # number of threads

def acesso(url):
    print "Acessando: " + url
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    ssl_socket = ssl.wrap_socket(s)
    urlParsed = urlparse(url)
    try:
        ssl_socket.connect((urlParsed[1], 443))
    except socket.timeout:
        print "Time out"
    ssl.get_server_certificate(urlParsed[1])
    pprint.pprint(ssl_socket.getpeercert())
    request = "GET " + urlParsed[2] + " HTTP/1.1\r\nUser-Agent: Python\r\nHost: " + urlParsed[1] + "\r\nConnection: persistent\r\n\r\n"
    ssl_socket.sendall(request)
    dados = ''
    try:
        while(True):
            buff = ssl_socket.recv(4096)
            if not len(buff):
                break
            dados += buff
    except socket.timeout:
        print "Time Out"
    ssl_socket.close()
    return dados

def navega(url, profundidade, vetorLinks, visitados):
    if not url in visitados and (url.startswith("http://") or url.startswith("https://")):
        visitados.append(url)  # save the url in the visited list
        html = acesso(url)  # html of the fetched page
        urls = re.findall(r"""<a href=[\'"]?([^\'" >]+)""", html)
        vetorLinks.extend(urls)
    if profundidade == 0:
        return
    else:
        try:
            for i in range(0, len(vetorLinks)):
                for j in range(0, nThreads):
                    t = Thread(target=navega, args=(vetorLinks[i], profundidade - 1, vetorLinks, linksVisitados))
                    t.start()
                    threads.append(t)
                    i += 1
            for t in threads:
                t.join()
        except RuntimeError:
            print "RuntimeError nas Threads."

navega(sys.argv[2], int(sys.argv[1]), vetorLinks, linksVisitados)
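From the docs it looks like get_server_certificate() expects a (host, port) tuple rather than a bare hostname, and getpeercert() apparently only returns a filled dict when the certificate is actually verified against a CA bundle. Here is a minimal sketch of what I think the certificate part should look like (the ca_certs path is just a guess for my system; I have not confirmed this works):

# Minimal sketch, assuming Python 2 and a system CA bundle at
# /etc/ssl/certs/ca-certificates.crt (adjust the path to your system)
import socket
import ssl
import pprint
from urlparse import urlparse

def certificado(url):
    host = urlparse(url)[1]
    # get_server_certificate() takes a (host, port) tuple, not a plain string
    pem = ssl.get_server_certificate((host, 443))
    print pem
    # getpeercert() only returns a populated dict when the certificate is
    # validated, i.e. the socket is wrapped with CERT_REQUIRED and a CA bundle
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    ssl_socket = ssl.wrap_socket(s,
                                 cert_reqs=ssl.CERT_REQUIRED,
                                 ca_certs="/etc/ssl/certs/ca-certificates.crt")
    ssl_socket.connect((host, 443))
    pprint.pprint(ssl_socket.getpeercert())
    ssl_socket.close()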