I am trying to build a web crawler that extracts all the links on a web page. I have created two Python files (the class: scanner.py, and the object that uses it: vulnerability-scanner.py). When I run the script it just keeps running and never stops, and I can't find the error. Please help me fix this.
Here is my source code:
scanner.py
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama


class Scanner:
    colorama.init()

    def __init__(self, url):
        self.target_url = url
        self.target_links = []

    def is_valid(self, url):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_website_links(self, url):
        GREEN = colorama.Fore.GREEN
        WHITE = colorama.Fore.WHITE
        RESET = colorama.Fore.RESET
        urls = set()
        internal_urls = set()
        external_urls = set()
        domain_name = urlparse(url).netloc
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not self.is_valid(href):
                continue
            if href in internal_urls:
                continue
            if domain_name not in href:
                if href not in external_urls:
                    print(f"{WHITE}[*] External link: {href}{RESET}")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET}")
            urls.add(href)
            internal_urls.add(href)
        return urls

    def crawl(self, url):
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            self.crawl(link)
vulnerability-scanner.py
import argu
target_url = "https://hack.me/"
vul_scanner = argu.Scanner(target_url)
vul_scanner.crawl(target_url)
The following part is (almost) an infinite recursion:
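(quoted from the crawl method in your scanner.py above)

    def crawl(self, url):
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            self.crawl(link)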
I believe you added this to implement the idea of crawling the links found on each page, but you never set a stopping condition. (Although at the moment it seems your only stopping condition is reaching a crawled page that contains no links at all.)
One stopping condition could be a predefined "max" number of levels (depth) to crawl.
Something like the following in the __init__ function:
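Here is a minimal sketch of that idea inside your Scanner class (the names max_depth and depth are only illustrative, not taken from your code):

    def __init__(self, url, max_depth=2):
        self.target_url = url
        self.target_links = []
        # assumed attribute: how many levels deep the crawler may recurse
        self.max_depth = max_depth

    def crawl(self, url, depth=0):
        # stop once the predefined maximum level is reached
        if depth > self.max_depth:
            return
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            # each recursive call goes one level deeper
            self.crawl(link, depth + 1)

Every recursive call then advances one level, and the recursion bottoms out once the configured depth is exceeded, so the crawl is guaranteed to terminate.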