Python3打印特定href链接

2024-05-15 01:16:02 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试编写一个脚本来抓取某个站点,只查找包含 .php?id= 的 href。我可以用 bs4 打印出所有 href,但无法只筛选出包含 .php?id= 的那些并打印它们。

<li><a href="#">Education & Research </a>
<ul>                         
<li><a href="caseofthe_month.php">Case of the Month</a></li>
<a href="page.php?id=2">
<a href="idontwantthispagetoshowup.php">
<a href="page.php?id=5">Prospectus Fellowship-July-14</a>
<a href="thisoneeither.php">

'''

def gethref(ip):
    """Fetch ``http://<ip>`` and try to print the hrefs containing ``.php?id=``.

    This is the asker's original, non-working attempt: the page is fetched
    and every ``href`` attribute is collected via XPath, but the filter
    below never matches (see the note on the ``if``).

    ``ip`` is concatenated directly after ``"http://"`` to form the URL.
    Nothing is returned; output goes to stdout.
    """
    url = ("http://" + ip)
    print("[x] ~ SCAN: " + url + " ~ [x]")
    req = requests.get(url)
    # NOTE(review): `html` is presumably `lxml.html` -- the import is not shown.
    tree = html.fromstring(req.text)
    # Selects the value of every href attribute in the document.
    tree_href = tree.xpath('//@href')
    #print(tree_href)
    # NOTE(review): `in` on a sequence compares each element for equality, so
    # this asks whether some href is exactly the text '*.php?id=*'. The
    # asterisks are not wildcards, which is why nothing is ever printed.
    if '*.php?id=*' in tree_href:
        print (tree_href)
    #soup = BeautifulSoup(req.text, 'html.parser')
    #h = soup.find_all('href=*.php')
    #print(h)
    #sqli = soup.select('a')
    #for link in soup.find_all('a'):
    #   sqli = (link.get('href'))
    #   sqli = str(sqli)
    #   print(sqli)
    #   if 'page' in sqli:
    #       print(sqli.a)

Tags: in ip id tree url get page li
2条回答

您可以使用CSS选择器a[href*=".php?id="]

from bs4 import BeautifulSoup

html_doc = """
<li><a href="#">Education & Research</a>

<ul>                         
<li>
    <a href="caseofthe_month.php">Case of the Month</a>
</li>
</ul>

<a href="page.php?id=2"></a>
<a href="idontwantthispagetoshowup.php">
<a href="page.php?id=5">Prospectus Fellowship-July-14</a>
<a href="thisoneeither.php"></a>
"""

soup = BeautifulSoup(html_doc, "html.parser")

# [href*="..."] is the CSS "attribute value contains substring" selector, so
# only anchors whose href includes ".php?id=" are returned.
php_id_anchors = soup.select('a[href*=".php?id="]')
for anchor in php_id_anchors:
    print(anchor["href"])

输出:

page.php?id=2
page.php?id=5

或:

# Walk every <a> tag and skip those whose href lacks the ".php?id=" marker.
# Anchors without an href are treated as having an empty one.
for anchor in soup.find_all("a"):
    target = anchor.get("href", "")
    if ".php?id=" not in target:
        continue
    print(anchor["href"])

或:

def _is_php_id_anchor(tag):
    """Return True for an <a> tag whose href contains ".php?id="."""
    return tag.name == "a" and ".php?id=" in tag.get("href", "")


# find_all accepts a callable and keeps the tags for which it returns True.
for anchor in soup.find_all(_is_php_id_anchor):
    print(anchor["href"])

这是查找包含.php?id=的所有HREF所需的代码

from bs4 import BeautifulSoup
import requests
import re

# Matches a literal ".php?id=" anywhere in an href. The dot is escaped so that
# look-alikes such as "graphp?id=1" are not reported.
PHP_ID_HREF = re.compile(r"\.php\?id=")


def gethref(ip, timeout=10):
    """Fetch ``http://<ip>`` and print every href that contains ``.php?id=``.

    Args:
        ip: Host (and optional path) appended to ``"http://"`` to form the URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The matching href values, in document order (empty list if none).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = "http://" + ip
    print("[x] ~ SCAN: " + url + " ~ [x]")
    # Without a timeout, requests can block indefinitely on a silent host.
    req = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req.text, 'html.parser')
    # A compiled pattern used as an attribute filter is applied with search(),
    # so no leading ".*" is needed to match in the middle of the value.
    hrefs = [tag["href"] for tag in soup.find_all(href=PHP_ID_HREF)]
    # Print the link targets themselves, one per line, not the Tag objects.
    for href in hrefs:
        print(href)
    return hrefs

我想这就是你需要的

如果它不起作用,请告诉我

相关问题 更多 >

    热门问题