BeautifulSoup：使用特定字符串查找标题正下方和正上方的元素

<div class="fixres__body" data-url="" data-view="fixture-update" data-controller="fixture-update" data-fn="live-refresh" data-sport="football" data-lite="true" id="widgetLite-6"> <h3 class="fixres__header1">November 2018</h3> <h4 class="fixres__header2">Saturday 24th November</h4> <h5 class="fixres__header3">Prem League</h5> <div class="fixres__item">stuff in here</div> <h4 class="fixres__header2">Wednesday 28th November</h4> <h5 class="fixres__header3">UEFA Champ League</h5> <div class="fixres__item">stuff in here</div> <h3 class="fixres__header1">December 2018</h3> <h4 class="fixres__header2">Sunday 2nd December</h4> <h5 class="fixres__header3">Prem League</h5> <div class="fixres__item">stuff in here</div>

def squad_fixtures():
    """Fetch each fixtures page and print the home-team names found on it.

    For every URL in ``team_table`` the page is downloaded and parsed, the
    ``div.fixres__body`` container is located, and for every
    ``div.fixres__item`` inside it the ``title`` attribute of the
    ``span.swap-text--bp30`` nested in each home-side participant span is
    collected.

    Pages without a ``div.fixres__body`` and participant spans without a
    ``span.swap-text--bp30`` are skipped.

    Returns nothing; the collected names are printed once after all pages
    have been processed.
    """
    team_table = ['https://someurl.com/liverpool-fixtures']
    # Accumulates the names across all pages; must exist before the first append.
    team_home_namesall = []

    for url in team_table:
        squad_r = requests.get(url)
        premier_squad_soup = BeautifulSoup(squad_r.text, 'html.parser')
        premier_fix_body = premier_squad_soup.find('div', {'class': 'fixres__body'})
        if premier_fix_body is None:
            # find() yields None when nothing matches; nothing to scan here.
            continue
        premier_fix_divs = premier_fix_body.find_all('div', {'class': 'fixres__item'})

        # Distinct loop names: reusing one name for all three levels made the
        # nesting hard to follow and clobbered the outer values.
        for fix_div in premier_fix_divs:
            team_home = fix_div.find_all(
                'span',
                {'class': 'matches__item-col matches__participant matches__participant--side1'},
            )
            for home_span in team_home:
                name_span = home_span.find('span', {'class': 'swap-text--bp30'})
                if name_span is None:
                    continue
                team_home_names = name_span['title']
                team_home_namesall.append(team_home_names)

    print(team_home_namesall)

1条回答

网友

1楼 · 发布于 2024-05-21 08:06:55

看起来您遇到的难点是：如何把抓取范围限制在 Premier League 的 <h5> 及其相关内容上。

Note: Your question states the string of the h5 should be Prem League, but it in fact appears to be Premier League when I look at the response.

这个HTML的结构非常扁平，各元素之间没有层级嵌套，所以最好的办法似乎是从h5出发，遍历它前面和后面的兄弟节点（sibling）；h5本身很容易定位：

import re

from bs4 import BeautifulSoup, Tag
import requests

# Pattern used to pick out the <h5> headers whose text contains "Premier League".
prem_league_regex = re.compile(r"Premier League")


def squad_fixtures():
    """Print the headers that precede each Premier League fixtures block.

    For every URL in ``team_table`` the page is fetched and parsed, and each
    ``h5.fixres__header3`` whose text matches ``prem_league_regex`` is
    located inside ``div.fixres__body``.  For each such ``h5``:

    * the nearest preceding tag is printed if it is an ``h4``;
    * the tag preceding that one is printed if it is an ``h3``;
    * the next tag after the ``h5`` is looked up as the starting point for
      further extraction.

    Pages without a ``div.fixres__body`` are skipped.  Returns nothing.
    """
    team_table = ['https://www.skysports.com/liverpool-fixtures']

    for url in team_table:
        squad_r = requests.get(url)
        soup = BeautifulSoup(squad_r.text, 'html.parser')
        body = soup.find('div', {'class': 'fixres__body'})
        if body is None:
            # find() yields None when nothing matches; nothing to scan here.
            continue

        h5s = body.find_all('h5', {'class': 'fixres__header3'}, text=prem_league_regex)
        for h5 in h5s:
            prev_tag = find_previous(h5)
            if prev_tag is not None:
                if prev_tag.name == 'h4':
                    print(prev_tag.text)
                # Step back one more tag regardless of what the first one was.
                prev_tag = find_previous(prev_tag)
                if prev_tag is not None and prev_tag.name == 'h3':
                    print(prev_tag.text)

            fixres_item_div = find_next(h5)
            # get the things you need from fixres__item now that you have it...



def find_previous(tag):
    """Return the nearest preceding sibling of *tag* that is a ``Tag``.

    Siblings that are not ``Tag`` instances (such as text nodes between
    elements) are skipped.

    Returns ``None`` when *tag* has no preceding ``Tag`` sibling, rather than
    failing with ``AttributeError`` once the sibling chain runs out.
    """
    prev_tag = tag.previous_sibling
    # previous_sibling is None at the start of the sibling chain; stop there.
    while prev_tag is not None and not isinstance(prev_tag, Tag):
        prev_tag = prev_tag.previous_sibling
    return prev_tag

def find_next(tag):
    """Return the nearest following sibling of *tag* that is a ``Tag``.

    Siblings that are not ``Tag`` instances (such as text nodes between
    elements) are skipped.

    Returns ``None`` when *tag* has no following ``Tag`` sibling, rather than
    failing with ``AttributeError`` once the sibling chain runs out.
    """
    next_tag = tag.next_sibling
    # next_sibling is None at the end of the sibling chain; stop there.
    while next_tag is not None and not isinstance(next_tag, Tag):
        next_tag = next_tag.next_sibling
    return next_tag

相关问题更多 >

编程相关推荐

热门问题

热门文章