查找用于抓取网站的正确元素

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Crawl the ECB home page, then every page it links to, and record the
# anchors found on each of those pages.
base_url = "https://www.ecb.europa.eu"
master_request = requests.get("https://www.ecb.europa.eu/")
master_soup = BeautifulSoup(master_request.content, 'html.parser')
master_atags = master_soup.find_all("a", href=True)

master_links = []  # absolute URLs found on the home page
sub_links = {}     # master URL -> list of raw hrefs found on that page

for master_atag in master_atags:
    # urljoin resolves relative hrefs against the site root and leaves
    # already-absolute hrefs untouched; plain concatenation would turn
    # "https://..." into "https://www.ecb.europa.euhttps://...".
    master_href = urljoin(base_url, master_atag.get('href'))
    if not master_href.startswith(("http://", "https://")):
        # mailto:, javascript:, tel: ... cannot be fetched with requests.
        continue
    print(master_href)
    master_links.append(master_href)

    sub_request = requests.get(master_href)
    sub_soup = BeautifulSoup(sub_request.content, 'html.parser')
    sub_atags = sub_soup.find_all("a", href=True)
    sub_links[master_href] = []
    for sub_atag in sub_atags:
        sub_href = sub_atag.get('href')
        sub_links[master_href].append(sub_href)
        print("\t" + sub_href)

from urllib.parse import urljoin

# Print the absolute URL of every home-page link that points to an
# English page.
for master_atag in master_atags:
    master_href = master_atag.get('href')
    # NOTE(review): assumes English pages are the ones with ".en" in the
    # href (e.g. "index.en.html") -- confirm against the site.
    if ".en" in master_href:
        # Resolve against the site root; absolute hrefs are kept as they are.
        master_href = urljoin(base_url, master_href)
        print(master_href)

from urllib.parse import urljoin

import pandas as pd

# Dates such as "20/12/2019" or "20 December 2019"; group 0 of each match
# is the full date string.
date_pattern = re.compile(
    r'(\d{2}[\/ ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|May'
    r'|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov'
    r'|December|Dec)[\/ ]\d{2,4})'
)

# Visit every collected sub link and gather one record per page that
# contains an <article> element.
records = []
for master_href, hrefs in sub_links.items():
    for sub_href in hrefs:
        # The hrefs were stored exactly as found on the master page, so
        # resolve them relative to that page.
        link = urljoin(master_href, sub_href)
        if not link.startswith(("http://", "https://")):
            continue

        resp = requests.get(link)
        soup = BeautifulSoup(resp.content, 'html5lib')
        article = soup.find('article')
        if article is None:
            # Not an article page; nothing to extract.
            continue
        title = soup.find('title')
        textdate = soup.find('h2')
        paragraphs = article.find_all('p')

        # As in the original loop, the last date found in the <h2> wins.
        datadate = None
        for match in date_pattern.findall(str(textdate)):
            print(match[0])
            datadate = match[0]

        records.append({
            "Article": article.get_text(strip=True),
            "Title": title.get_text(strip=True) if title is not None else None,
            "Text": "\n\n".join(p.get_text() for p in paragraphs),
            "date": datadate,
        })

# Build the frame once, after the crawl, instead of overwriting it per page.
ecbdf = pd.DataFrame(records, columns=["Article", "Title", "Text", "date"])

import feedparser
import pandas as pd
import requests

# Parse the ECB RSS feed and flatten its entries into a DataFrame.
# NOTE(review): this URL ends in ".html" and looks like the RSS overview
# page rather than a feed itself -- confirm it yields any entries.
rss_url = 'https://www.ecb.europa.eu/home/html/rss.en.html'
ecb_feed = feedparser.parse(rss_url)
# pd.json_normalize is the supported location of this function; the old
# pandas.io.json import path is deprecated.
df_ecb_feed = pd.json_normalize(ecb_feed.entries)
df_ecb_feed.head()

1条回答

网友

1楼 · 发布于 2024-06-16 11:29:40

抓取这个站点的最大问题可能是延迟加载：网站使用 JavaScript 从多个 HTML 页面加载文章，并将它们合并到一个列表中。有关详细信息，请查看页面源代码中的 index_include。这对于仅使用 requests 和 BeautifulSoup 进行抓取是有问题的，因为您的 soup 实例从响应内容中获得的只是基本框架，而没有文章列表。现在您有两个选择：

使用延迟加载的文章列表，而不是主要的文章列表页面（新闻稿、采访等），例如/press/pr/date/2019/html/index_include.en.html。这可能是一个更容易的选择，但你必须在你感兴趣的每一年都这样做
使用像 Selenium 这样能够执行 JavaScript 的客户端（而不是 requests）来获取 HTML

除此之外，我建议使用 CSS 选择器从 HTML 代码中提取信息。这样，只需几行代码就能提取文章内容。另外，如果你使用 index.en.html 页面进行抓取，我认为你不必过滤英语文章，因为它默认显示英语版本，并在有其他语言版本时一并显示

下面是一个我快速组合的示例，这当然可以优化，但它展示了如何使用Selenium加载页面并提取文章URL和文章内容：

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver

# Load each listing page in a real browser, then open every article it
# links to and print the article's title, date and the start of its text.
base_url = 'https://www.ecb.europa.eu'
urls = [
    f'{base_url}/press/pr/html/index.en.html',
    f'{base_url}/press/govcdec/html/index.en.html'
]
driver = webdriver.Chrome()

try:
    for url in urls:
        driver.get(url)
        # page_source is parsed into a separate tree here, so navigating
        # the driver below does not affect the anchors being iterated.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for anchor in soup.select('span.doc-title > a[href]'):
            # urljoin handles both site-relative and absolute hrefs.
            driver.get(urljoin(base_url, anchor['href']))
            article_soup = BeautifulSoup(driver.page_source, 'html.parser')

            title_tag = article_soup.select_one('h1.ecb-pressContentTitle')
            date_tag = article_soup.select_one('p.ecb-publicationDate')
            if title_tag is None or date_tag is None:
                # Page does not use the expected markup; skip it rather
                # than fail with AttributeError on None.
                continue

            title = title_tag.text
            date = date_tag.text
            paragraphs = article_soup.select('div.ecb-pressContent > article > p:not([class])')
            content = '\n\n'.join(p.text for p in paragraphs)

            print(f'title: {title}')
            print(f'date: {date}')
            print(f'content: {content[0:80]}...')
finally:
    # Always shut the browser down, even if a page fails to load or parse.
    driver.quit()

我在新闻稿页面获得以下输出：

title: ECB appoints Petra Senkovic as Director General Secretariat and Pedro Gustavo Teixeira as Director General Secretariat to the Supervisory Board                         
date: 20 December 2019                                    
content: The European Central Bank (ECB) today announced the appointments of Petra Senkov...

title: Monetary policy decisions                          
date: 12 December 2019                                    
content: At today’s meeting the Governing Council of the European Central Bank (ECB) deci...

相关问题更多 >

编程相关推荐

热门问题

热门文章