Deep parsing with BeautifulSoup

Published 2024-04-19 05:35:07


I am trying to parse https://www.drugbank.ca/drugs and extract the name and the additional information for every drug. As you can see, each page shows a table of drug names, and clicking a drug name takes you to that drug's detail page. Suppose I keep the following code to handle the pagination:

import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()

The problem is that on every page, for each drug in that page's table, I need to capture: the name, the accession number, the structured indications, and the generic prescription products.

I am using the classic requests/BeautifulSoup approach, but I can't drill down into the detail pages.

Please help.


2 Answers

To crawl effectively, you need to put a few measures in place, such as maintaining a queue of URLs to visit and keeping track of which URLs you have already visited.

Keep in mind that links can be absolute or relative, that redirects are very likely, and that you will probably want to build URLs dynamically rather than by string concatenation, as the short example below shows.
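
For instance, urljoin from urllib.parse resolves a relative href against the page it was found on and leaves an absolute href untouched; the paths below are purely illustrative:

from urllib.parse import urljoin

page = 'https://www.drugbank.ca/drugs?page=2'

# a relative href is resolved against the current page's scheme and host
print(urljoin(page, '/drugs/DB00001'))         # https://www.drugbank.ca/drugs/DB00001

# an absolute href is returned unchanged
print(urljoin(page, 'https://example.com/x'))  # https://example.com/x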

Here is a generic crawl workflow (on SO we generally prefer to stick to example.com)...

from urllib.parse import urljoin, urlparse  # Python 3
# from urlparse import urljoin, urlparse    # legacy Python 2
import requests
from bs4 import BeautifulSoup
def process_page(soup):
    '''data extraction process'''
    pass

def is_external(link, base='example.com'):
    '''determine if the link is external to base'''
    site = urlparse(link).netloc
    return base not in site

def resolve_link(current_location, href):
    '''resolves final location of a link including redirects'''
    req_loc = urljoin(current_location, href)
    response = requests.head(req_loc, allow_redirects=True)  # HEAD does not follow redirects by default
    resolved_location = response.url # location after redirects
    # if you don't want to visit external links...
    if is_external(resolved_location):
        return None
    return resolved_location

url_queue = ['https://example.com']
visited = set()
while url_queue:
    url = url_queue.pop() # take a url off the end of the queue and assign it to `url`
    response = requests.get(url)
    current_location = response.url # final location after redirects
    visited.add(url) # note that we've visited the given url
    visited.add(current_location) # and the final location
    soup = BeautifulSoup(response.text, 'lxml')
    process_page(soup) # scrape the page
    link_tags = soup.find_all('a') # gather additional links
    for anchor in link_tags:
        href = anchor.get('href')
        link_location = resolve_link(current_location, href)
        if link_location and link_location not in visited:
            url_queue.append(link_location)
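
One design detail worth noting: url_queue.pop() removes the most recently added item, so the crawl above proceeds depth-first. If you would rather go breadth-first (finish each listing page before descending into its detail pages), a collections.deque with popleft() is a drop-in change; a minimal sketch:

from collections import deque

url_queue = deque(['https://example.com'])
while url_queue:
    url = url_queue.popleft()  # FIFO: oldest queued url first (breadth-first)
    # ... same fetch / parse / enqueue logic as in the loop above ...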

Use requests and BeautifulSoup to create a function that gets the data from each subpage:

import requests
from bs4 import BeautifulSoup

def get_details(url):
    print('details:', url)

    # get subpage
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    # get data on subpage
    dts = soup.findAll('dt')
    dds = soup.findAll('dd')

    # display details
    for dt, dd in zip(dts, dds):
        print(dt.text)
        print(dd.text)
        print(' -')

    print('             -')

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        # get links to subpages
        links = soup.select('strong a')
        for link in links:
            # execute function to get subpage
            get_details('https://www.drugbank.ca' + link['href'])

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
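
If only the four fields from the question are needed rather than every dt/dd pair, the same subpage request can be filtered by label text. Below is a minimal sketch, assuming the labels appear on a drug page as 'Name', 'Accession Number', 'Structured Indications' and 'Generic Prescription Products'; check the live HTML first, since DrugBank changes its markup from time to time:

import requests
from bs4 import BeautifulSoup

# assumed dt labels on a drug page; adjust after inspecting the real HTML
WANTED = {'Name', 'Accession Number', 'Structured Indications', 'Generic Prescription Products'}

def get_selected_details(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    details = {}
    for dt in soup.find_all('dt'):
        label = dt.get_text(strip=True)
        if label in WANTED:
            dd = dt.find_next_sibling('dd')  # the value sits in the following <dd>
            if dd:
                details[label] = dd.get_text(' ', strip=True)
    return details

# example usage (the drug id is illustrative):
# print(get_selected_details('https://www.drugbank.ca/drugs/DB00001'))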
