def crawl(self,urlObj):
'''Main function to crawl URL's '''
try:
if ((urlObj.valid) and (urlObj.url not in CRAWLED_URLS.keys())):
rsp = urlcon.urlopen(urlObj.url,timeout=2)
hCode = rsp.read()
soup = BeautifulSoup(hCode)
links = self.scrap(soup)
boolStatus = self.checkmax()
if boolStatus:
CRAWLED_URLS.setdefault(urlObj.url,"True")
else:
return
for eachLink in links:
if eachLink not in VISITED_URLS:
parsedURL = urlparse(eachLink)
if parsedURL.scheme and "javascript" in parsedURL.scheme:
#print("***************Javascript found in scheme " + str(eachLink) + "**************")
continue
'''Handle internal URLs '''
try:
if not parsedURL.scheme and not parsedURL.netloc:
#print("No scheme and host found for " + str(eachLink))
newURL = urlunparse(parsedURL._replace(**{"scheme":urlObj.scheme,"netloc":urlObj.netloc}))
eachLink = newURL
elif not parsedURL.scheme :
#print("Scheme not found for " + str(eachLink))
newURL = urlunparse(parsedURL._replace(**{"scheme":urlObj.scheme}))
eachLink = newURL
if eachLink not in VISITED_URLS: #Check again for internal URL's
#print(" Found child link " + eachLink)
CRAWL_BUFFER.append(eachLink)
with self._lock:
self.count += 1
#print(" Count is =================> " + str(self.count))
boolStatus = self.checkmax()
if boolStatus:
VISITED_URLS.setdefault(eachLink, "True")
else:
return
except TypeError:
print("Type error occured ")
else:
print("URL already present in visited " + str(urlObj.url))
except socket.timeout as e:
print("**************** Socket timeout occured*******************" )
except URLError as e:
if isinstance(e.reason, ConnectionRefusedError):
print("**************** Conn refused error occured*******************")
elif isinstance(e.reason, socket.timeout):
print("**************** Socket timed out error occured***************" )
elif isinstance(e.reason, OSError):
print("**************** OS error occured*************")
elif isinstance(e,HTTPError):
print("**************** HTTP Error occured*************")
else:
print("**************** URL Error occured***************")
except Exception as e:
print("Unknown exception occured while fetching HTML code" + str(e))
traceback.print_exc()
几点意见:
下面是一个简单的实现:它没有fecch内部链接(只有绝对形式的超链接),也没有任何错误处理(403404,没有链接,…),而且速度非常慢(在这种情况下,
multiprocessing
模块可以帮助很多)。在输出:
^{pr2}$下面是一个主要的爬网方法,用来递归地从网页中删除链接。此方法将抓取一个URL并将所有已爬网的URL放入缓冲区。现在,多个线程将等待从这个全局缓冲区弹出url并再次调用这个爬网方法。在
完整的源代码和指令可在https://github.com/tarunbansal/crawler上找到
相关问题 更多 >
编程相关推荐