在Python中使用BeautifulSoup4删除html并区分相同的标记

soup = BeautifulSoup(res.content, 'html.parser')
html = soup.find_all("div", {"class": "forecast_tip"})

# Text of every "tip_date_time" div found inside the forecast blocks.
dateCond = [
    node.getText()
    for block in html
    for node in block.find_all("div", {"class": "tip_date_time"})
]

# Text of every "tip_wave" span found inside the forecast blocks.
waveCond = [
    node.getText()
    for block in html
    for node in block.find_all("span", {"class": "tip_wave"})
]

<div class="forecast_tip"> <div class="tip_date_time">6am Mon 21 Sep</div> <div class="tip_surf"> <span class="tip_wave">2ft ENE</span> <span class="tip_wind">7kt NNW</span> </div> <div class="tip_description">(Waist-Shoulder High)</div> <div class="tip_train">1.3m @ 7.5s ENE (64°)</div> <div class="tip_train">0.4m @ 13.1s SSE (167°)</div> <div class="tip_train">0.3m @ 13.8s SSW (194°)</div> <div class="tip_tides"> <div class="tip_tide"> <span class="tip_tide_label">Low:</span> <span class="tip_tide_value">Sun 4:29pm (0.20m)</span> </div> <div class="tip_tide"> <span class="tip_tide_label">High:</span> <span class="tip_tide_value">Sun 10:40pm (1.67m)</span> </div> </div> </div>

url = "https://www.swellnet.com/reports/australia/new-south-wales/northern-beaches/forecast"
res = requests.get(url)
# raise_for_status is a method: it must be called, otherwise the bare
# attribute access does nothing and HTTP errors go unnoticed.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')
# Every div with class "forecast_tip" is one forecast block. The sample
# output has 9 of them (3 days x 6am, 12pm, 6pm).
forecast = soup.find_all("div", {"class": "forecast_tip"})


def getData(html, attribute, _class, index):
    """Collect the text of the index-th matching tag from each block.

    Args:
        html: Iterable of parsed tags (here, the "forecast_tip" divs).
        attribute: Tag name to search for inside each block, e.g. "div".
        _class: CSS class the searched tag must carry.
        index: Position of the wanted tag among the matches in one block.

    Returns:
        A list with exactly one entry per block: the text of the selected
        tag, or "N/A" when the block has no match at that position.
    """
    result = []
    for tag in html:
        matches = tag.find_all(attribute, {"class": _class})
        try:
            match = matches[index]
        except IndexError:
            # Keep one entry per block so the zipped columns stay aligned.
            result.append("N/A")
        else:
            result.append(match.getText())
    return result


date = getData(forecast, "div", "tip_date_time", 0)
# The same class is used for every swell train, so they are told apart by
# their position inside the block.
train1 = getData(forecast, "div", "tip_train", 0)
train2 = getData(forecast, "div", "tip_train", 1)
wave = getData(forecast, "span", "tip_wave", 0)

logging.debug(date)
logging.debug(train1)
logging.debug(train2)
logging.debug(wave)

forecast_data = list(zip(date, train1, train2, wave))
headers = ["Date", "Primary Swell", "Secondary Swell", "Wave Height"]
print(tabulate(forecast_data, headers=headers))

Date             Primary Swell            Secondary Swell          Wave Height
---------------  -----------------------  -----------------------  -------------
6am Wed 23 Sep   0.6m @ 8.3s NE (54°)     0.2m @ 13s SSW (195°)    1ft NE
12pm Wed 23 Sep  0.5m @ 8.4s NE (54°)     0.2m @ 12.3s SSW (194°)  1ft NE
6pm Wed 23 Sep   0.4m @ 8.4s NE (56°)     0.2m @ 11.1s SSW (200°)  1ft NE
6am Thu 24 Sep   0.4m @ 10.1s SSW (204°)  0.2m @ 9.9s ENE (77°)    0.5ft SSW
12pm Thu 24 Sep  0.6m @ 10.1s SSW (205°)  0.3m @ 9.8s ENE (73°)    1ft SSW
6pm Thu 24 Sep   0.7m @ 9.9s SSW (203°)   0.2m @ 9.8s ENE (77°)    1ft SSW
6am Fri 25 Sep   0.6m @ 9.1s SSW (197°)   0.2m @ 12.5s SSE (165°)  1ft SSW
12pm Fri 25 Sep  0.3m @ 12.1s S (169°)    0.5m @ 8.9s SSW (192°)   0.5ft S
6pm Fri 25 Sep   0.5m @ 8.8s S (188°)     0.3m @ 11.6s S (169°)    0.5ft S

1条回答

网友

1楼 · 发布于 2024-04-24 23:32:13

您实际上不必使用两个 tip_train 实例。您仍然可以抓取所有数据；如果有任何缺失，就替换缺失的部分，然后打印您得到的数据。

这里有一种方法：

import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


url = "https://www.swellnet.com/reports/australia/new-south-wales/northern-beaches/forecast"
response = requests.get(url)
# Fail loudly on 4xx/5xx responses instead of parsing an error page, which
# would otherwise just produce an empty table.
response.raise_for_status()
# Every div with class "forecast_tip" is one forecast block to read from.
forecast = BeautifulSoup(response.content, 'html.parser').find_all(
    "div", {"class": "forecast_tip"}
)


def get_data(html, attribute: str, _class: str) -> list:
    """Return one text entry per block in *html*.

    For each block, ``find`` is asked for a tag named *attribute* carrying
    the class *_class*. The entry is that tag's text, or ``"N/A"`` when
    ``find`` returns ``None``.
    """
    found = (block.find(attribute, {"class": _class}) for block in html)
    return ["N/A" if node is None else node.getText() for node in found]


# One column per (tag name, CSS class) pair, in the order they are printed.
date, train, wave = (
    get_data(forecast, tag_name, css_class)
    for tag_name, css_class in (
        ("div", "tip_date_time"),
        ("div", "tip_train"),
        ("span", "tip_wave"),
    )
)

headers = ["Date", "Swell Train Data", "Wave Height"]
# zip turns the three columns into one row tuple per forecast block.
forecast_data = list(zip(date, train, wave))

print(tabulate(forecast_data, headers=headers))

打印结果如下：

Date             Swell Train Data         Wave Height
---------------  -----------------------  -------------
6am Wed 23 Sep   0.6m @ 8.3s NE (54°)     1ft NE
12pm Wed 23 Sep  0.5m @ 8.4s NE (54°)     1ft NE
6pm Wed 23 Sep   0.4m @ 8.4s NE (56°)     1ft NE
6am Thu 24 Sep   0.4m @ 10.1s SSW (204°)  0.5ft SSW
12pm Thu 24 Sep  0.6m @ 10.1s SSW (205°)  1ft SSW
6pm Thu 24 Sep   0.7m @ 9.9s SSW (203°)   1ft SSW
6am Fri 25 Sep   0.6m @ 9.1s SSW (197°)   1ft SSW
12pm Fri 25 Sep  0.3m @ 12.1s S (169°)    0.5ft S
6pm Fri 25 Sep   0.5m @ 8.8s S (188°)     0.5ft S

相关问题更多 >

编程相关推荐

热门问题

热门文章