Deep parsing with BeautifulSoup

Published 2024-04-19 05:35:07


I am trying to parse https://www.drugbank.ca/drugs and extract the name and the additional information for every drug. As you can see, each page shows a table of drug names, and clicking a drug name takes you to that drug's detail page. Suppose I keep the following code to handle the pagination:

import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()

The problem is that on every page, for each drug in that page's table, I need to capture: the name, the accession number, the structured indications, and the generic prescription products.

I am using the classic requests/BeautifulSoup approach, but I can't drill down into the detail pages.

Please help.


2 Answers

To crawl effectively, you need to put a few measures in place, such as maintaining a queue of URLs to visit and keeping track of which URLs you have already visited.

Keep in mind that links can be absolute or relative, that redirects are very likely, and that you will probably want to build URLs dynamically rather than by string concatenation, as the short example below shows.
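
For instance, urljoin from urllib.parse resolves a relative href against the page it was found on and leaves an absolute href untouched; the paths below are purely illustrative:

from urllib.parse import urljoin

page = 'https://www.drugbank.ca/drugs?page=2'

# a relative href is resolved against the current page's scheme and host
print(urljoin(page, '/drugs/DB00001'))         # https://www.drugbank.ca/drugs/DB00001

# an absolute href is returned unchanged
print(urljoin(page, 'https://example.com/x'))  # https://example.com/x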

Here is a generic crawl workflow (on SO we generally prefer to stick to example.com)...

from urllib.parse import urljoin, urlparse  # Python 3
# from urlparse import urljoin, urlparse    # legacy Python 2
import requests
from bs4 import BeautifulSoup
def process_page(soup):
    '''data extraction process'''
    pass

def is_external(link, base='example.com'):
    '''determine if the link is external to base'''
    site = urlparse(link).netloc
    return base not in site

def resolve_link(current_location, href):
    '''resolves final location of a link including redirects'''
    req_loc = urljoin(current_location, href)
    response = requests.head(req_loc, allow_redirects=True)  # HEAD does not follow redirects by default
    resolved_location = response.url # location after redirects
    # if you don't want to visit external links...
    if is_external(resolved_location):
        return None
    return resolved_location

url_queue = ['https://example.com']
visited = set()
while url_queue:
    url = url_queue.pop() # take a url off the end of the queue and assign it to `url`
    response = requests.get(url)
    current_location = response.url # final location after redirects
    visited.add(url) # note that we've visited the given url
    visited.add(current_location) # and the final location
    soup = BeautifulSoup(response.text, 'lxml')
    process_page(soup) # scrape the page
    link_tags = soup.find_all('a') # gather additional links
    for anchor in link_tags:
        href = anchor.get('href')
        link_location = resolve_link(current_location, href)
        if link_location and link_location not in visited:
            url_queue.append(link_location)
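
One design detail worth noting: url_queue.pop() removes the most recently added item, so the crawl above proceeds depth-first. If you would rather go breadth-first (finish each listing page before descending into its detail pages), a collections.deque with popleft() is a drop-in change; a minimal sketch:

from collections import deque

url_queue = deque(['https://example.com'])
while url_queue:
    url = url_queue.popleft()  # FIFO: oldest queued url first (breadth-first)
    # ... same fetch / parse / enqueue logic as in the loop above ...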

Use requests and BeautifulSoup to create a function that gets the data from each subpage:

import requests
from bs4 import BeautifulSoup

def get_details(url):
    print('details:', url)

    # get subpage
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    # get data on subpage
    dts = soup.findAll('dt')
    dds = soup.findAll('dd')

    # display details
    for dt, dd in zip(dts, dds):
        print(dt.text)
        print(dd.text)
        print(' -')

    print('             -')

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        # get links to subpages
        links = soup.select('strong a')
        for link in links:
            # execute function to get subpage
            get_details('https://www.drugbank.ca' + link['href'])

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
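
If only the four fields from the question are needed rather than every dt/dd pair, the same subpage request can be filtered by label text. Below is a minimal sketch, assuming the labels appear on a drug page as 'Name', 'Accession Number', 'Structured Indications' and 'Generic Prescription Products'; check the live HTML first, since DrugBank changes its markup from time to time:

import requests
from bs4 import BeautifulSoup

# assumed dt labels on a drug page; adjust after inspecting the real HTML
WANTED = {'Name', 'Accession Number', 'Structured Indications', 'Generic Prescription Products'}

def get_selected_details(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    details = {}
    for dt in soup.find_all('dt'):
        label = dt.get_text(strip=True)
        if label in WANTED:
            dd = dt.find_next_sibling('dd')  # the value sits in the following <dd>
            if dd:
                details[label] = dd.get_text(' ', strip=True)
    return details

# example usage (the drug id is illustrative):
# print(get_selected_details('https://www.drugbank.ca/drugs/DB00001'))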
