How to get href attributes with BeautifulSoup

Posted 2024-05-19 02:10:22


The situation

I want to scrape data from this page: http://www.dpm.tn/dpm_pharm/medicament/listmedicparnomspec.php

My code:

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import requests
    from bs4 import BeautifulSoup

    # agent
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"

    # headless driver
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--window-size=1920,1080")
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--allow-running-insecure-content')
    options.add_argument("--disable-extensions")
    options.add_argument("--proxy-server='direct://'")
    options.add_argument("--proxy-bypass-list=*")
    options.add_argument("--start-maximized")
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-sandbox')
    # raw string so the backslashes in the Windows path are kept literally
    driver = webdriver.Chrome(executable_path=r"D:\Downloads\chromedriver.exe", options=options)

    # request test
    medecine = 'doliprane'

    # submitting a search
    driver.get('http://www.dpm.tn/dpm_pharm/medicament/listmedicparnomspec.php')
    e = driver.find_element_by_name('id')
    e.send_keys(medecine)
    e.submit()

    # getting the result table
    try:
        table = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody')
        print('success')
    except Exception:
        print('failed')

The code to get the links:


    print('bs4 turn \n')
    result = BeautifulSoup(table.get_attribute('innerHTML'), 'lxml')

    rows = result.find_all('tr')
    links = []
    real_link = []
    for row in rows:
        links.append(row.find('a', href=True))
    for each in links:
        print(each['href'])

The problem:

Whenever I run this, I get the following error: 'NoneType' object is not subscriptable

The question:

How can I get this information and extract the href attributes I need?
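
For context, a minimal sketch of what triggers the error: row.find('a', href=True) returns None for any tr that contains no link (the table's header row, for example), and subscripting None raises exactly this TypeError:

    from bs4 import BeautifulSoup

    # a row with no <a> tag, like the header row of the result table
    row = BeautifulSoup('<table><tr><th>header</th></tr></table>', 'html.parser').tr
    a = row.find('a', href=True)
    print(a)          # -> None
    print(a['href'])  # -> TypeError: 'NoneType' object is not subscriptable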


3 answers

When accessing the attribute, try:

    print('bs4 turn \n')
    result = BeautifulSoup(table.get_attribute('innerHTML'), 'lxml')

    rows = result.find_all('tr')
    links = []
    real_link = []
    for row in rows:
        a = row.find("a", href=True)
        if a is not None:  # rows without a link (e.g. the header) return None
            links.append(a['href'])
    for each in links:
        print(each)
  

Instead of using Selenium, fetch the data with the requests library and parse it.

Code:

    import re
    import requests
    from bs4 import BeautifulSoup

    medecine = 'doliprane'
    # POST straight to the endpoint the search form submits to
    url = "http://www.dpm.tn/dpm_pharm/medicament/listmedicspec.php"
    payload = {"id": medecine}
    response = requests.post(url, data=payload)
    parsedhtml = BeautifulSoup(response.content, "html.parser")
    # keep only the links that point at a result page (fiche.php...)
    regex = re.compile('fiche.php.*')
    atag = parsedhtml.find_all("a", {"href": regex})
    # turn the relative hrefs into absolute URLs
    links = [i['href'].replace("fiche.php", "http://www.dpm.tn/dpm_pharm/medicament/fiche.php") for i in atag]
    print(links)
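
A small variant of the same idea (a sketch, assuming the same endpoint and payload): urllib.parse.urljoin can build the absolute links instead of string replacement, and it resolves any other relative path correctly too:

    import re
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    url = "http://www.dpm.tn/dpm_pharm/medicament/listmedicspec.php"
    response = requests.post(url, data={"id": "doliprane"})
    soup = BeautifulSoup(response.content, "html.parser")

    # resolve each relative href (e.g. "fiche.php?...") against the page URL
    links = [urljoin(url, a['href'])
             for a in soup.find_all("a", href=re.compile(r'^fiche\.php'))]
    print(links)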

Let me know if you have any questions :)

I solved the problem myself, using Selenium instead of BeautifulSoup:

    # "max" was left undefined in the original answer; it has to be the number
    # of table rows plus one, so count the rows first
    rows = driver.find_elements_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr')
    max = len(rows) + 1

    links = []
    for i in range(2, max):
        # the link sits in the 11th cell of each data row
        a_driver = driver.find_element_by_xpath(f'/html/body/table/tbody/tr/td/table/tbody/tr[{i}]/td[11]/a')
        link = a_driver.get_attribute('href')
        links.append(link)

    for i in range(0, len(links)):
        print(links[i])

This worked for me.
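
A slightly shorter version of the same Selenium-only approach (a sketch, assuming the same table XPath) collects all the anchor elements in one call instead of building an XPath per row:

    anchors = driver.find_elements_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[11]/a')
    links = [a.get_attribute('href') for a in anchors]
    for link in links:
        print(link)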
