Python script keeps running indefinitely

Posted 2024-05-14 23:01:26


I'm trying to build a web crawler that extracts all the links on a page. I have created two Python files (the class: scanner.py, and the object: vulnerability-scanner.py). When I run the script, it just keeps running and never stops. I can't find the error. Please help me fix this.

Here is my source code:

scanner.py

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

class Scanner:

    colorama.init()

    def __init__(self, url):
        self.target_url = url
        self.target_links = []

    def is_valid(self, url):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_website_links(self, url):

        GREEN = colorama.Fore.GREEN
        WHITE = colorama.Fore.WHITE
        RESET = colorama.Fore.RESET

        urls = set()
        internal_urls = set()
        external_urls = set()
        domain_name = urlparse(url).netloc
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

            if not self.is_valid(href):
                continue
            if href in internal_urls:
                continue
            if domain_name not in href:
                if href not in external_urls:
                    print(f"{WHITE}[*] External link: {href}{RESET}")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET}")
            urls.add(href)
            internal_urls.add(href)
        return urls

    def crawl(self, url):
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            self.crawl(link)

vulnerability-scanner.py

import argu

target_url = "https://hack.me/"
vul_scanner = argu.Scanner(target_url)
vul_scanner.crawl(target_url)

1 Answer
Answered by a user on 2024-05-14 23:01:26

The following part is (almost) an infinite recursion:

for link in href_links:
    print(link)
    self.crawl(link)

I believe you added this with the idea of crawling the links found on each page, but you never set a stopping condition. (Well, strictly speaking, your only stopping condition right now is reaching a crawled page that contains no links at all.)

One possible stopping condition is to set a predefined maximum number of levels to crawl.

Something like this in the __init__ function:

def __init__(self, url):
    self.target_url = url
    self.target_links = []
    self.max_parse_levels = 5  # you can go a step further and make this an input to the constructor (i.e. the __init__ function)
    self.cur_parse_levels = 0
.
.
.

def crawl(self, url):
    if self.cur_parse_levels > self.max_parse_levels:
        return
    self.cur_parse_levels += 1
    href_links = self.get_all_website_links(url)
    for link in href_links:
        print(link)
        self.crawl(link)
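
Another option (not from the answer above, just a rough sketch) is to remember which URLs have already been crawled and skip them, which also prevents two pages that link to each other from recursing forever. Assuming the rest of the Scanner class stays as posted, and using a hypothetical crawled_urls attribute:

def __init__(self, url):
    self.target_url = url
    self.target_links = []
    self.crawled_urls = set()  # hypothetical attribute: URLs already visited

def crawl(self, url):
    # stop if this page was already crawled
    if url in self.crawled_urls:
        return
    self.crawled_urls.add(url)
    href_links = self.get_all_website_links(url)
    for link in href_links:
        print(link)
        self.crawl(link)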
