如何抓取 sofifa 网站上的球员位置（Positions）：用 BeautifulSoup 获取 span 里面的文字

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Get basic players information for all players
base_url = "https://sofifa.com/players?offset="
columns = ['ID', 'Name', 'Age', 'Positions', 'Nationality', 'Overall',
           'Potential', 'Club', 'Value', 'Wage']

# Collect plain row lists and build the DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and was removed
# in pandas 2.0.
rows = []
for offset in range(0, 335):
    url = base_url + str(offset * 60)
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    table_body = soup.find('tbody')
    for row in table_body.find_all('tr'):
        td = row.find_all('td')
        pid = td[0].find('img').get('id')
        nationality = td[1].find('img').get('title')
        name = td[1].find("a").get("data-tooltip")
        rel = td[1].find_all('a', {'rel': 'nofollow'})
        # Keep the text of every position <span>, not just the last one,
        # and store it as a single comma-separated string.
        positions = ", ".join(
            span.text.strip() for span in rel[0].find_all('span')
        )
        age = td[2].text
        overall = td[3].text.strip()
        potential = td[4].text.strip()
        club = td[5].find('a').text
        value = td[6].text.strip()
        wage = td[7].text.strip()
        rows.append([pid, name, age, positions, nationality, overall,
                     potential, club, value, wage])
    print("done for " + str(offset), end="\r")

data = pd.DataFrame(rows, columns=columns)
# drop_duplicates returns a new frame; the result must be assigned.
data = data.drop_duplicates().reset_index(drop=True)
data.to_csv('player data.csv', encoding='utf-8-sig')

1条回答

网友

1楼 · 发布于 2024-06-09 15:09:32

要获取以逗号分隔的字符串形式的位置，可以尝试：

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_data(offset, timeout=30):
    """Scrape one page of the sofifa.com player listing.

    Args:
        offset: Page index; it is multiplied by 60 to form the ``offset``
            query parameter of the request URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper forever.

    Returns:
        A list with one entry per ``tbody tr`` row. Each entry is a list of
        strings in the order: id, name, age, positions (joined with ", "),
        nationality, overall, potential, club, value, wage.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://sofifa.com/players?offset=" + str(offset * 60)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    rv = []
    for row in soup.select("tbody tr"):
        id_ = row.select_one("img[id]")["id"]
        name = row.select_one(".col-name [data-tooltip]")["data-tooltip"]
        age = row.select_one(".col-ae").get_text(strip=True)
        positions = [p.get_text(strip=True) for p in row.select("span.pos")]
        nationality = row.select_one("img.flag")["title"]
        overall = row.select_one(".col-oa").get_text(strip=True)
        potential = row.select_one(".col-pt").get_text(strip=True)

        # Look the club link up once; it is needed for both text and title.
        club_link = row.select_one(".col-name > div > a")
        club = club_link.get_text(strip=True)

        # sometimes there isn't any club, just country:
        if club == "":
            club = club_link["title"]

        value = row.select_one(".col-vl").get_text(strip=True)
        wage = row.select_one(".col-wg").get_text(strip=True)
        rv.append(
            [
                id_,
                name,
                age,
                ", ".join(positions),
                nationality,
                overall,
                potential,
                club,
                value,
                wage,
            ]
        )

    return rv


# Column names, in the same order as the values get_data() puts in each row.
COLUMNS = [
    "ID",
    "Name",
    "Age",
    "Positions",
    "Nationality",
    "Overall",
    "Potential",
    "Club",
    "Value",
    "Wage",
]

# Fetch the listing page by page and gather every row in one flat list.
all_data = []
for offset in range(0, 3):  # < - increase offset here
    print("Offset {}...".format(offset))
    all_data.extend(get_data(offset))

# Build the frame once from all rows (requires pandas imported as ``pd``).
df = pd.DataFrame(all_data, columns=COLUMNS)

print(df)
# index=False keeps the row index out of the CSV file.
df.to_csv("data.csv", index=False)

输出：

...

141  241637               Aurélien Tchouaméni  20       CM, CDM          France      77        85                 AS Monaco     €23M   €35K
142  258315             Bright Akwo Arrey-Mbi  17        CB, LB         Germany      62        85         Bayern München II    €1.2M   €500
143  245367                       Xavi Simons  17            CM     Netherlands      65        84       Paris Saint-Germain    €1.8M    €2K
144  207865                Marcos Aoás Corrêa  26       CB, CDM          Brazil      87        90       Paris Saint-Germain   €92.5M  €135K
145  241852                      Moussa Diaby  20        LW, LM          France      81        88       Bayer 04 Leverkusen     €51M   €60K
146  188567         Pierre-Emerick Aubameyang  31        ST, LW           Gabon      85        85                   Arsenal   €45.5M  €145K

...

并保存data.csv（LibreOffice的屏幕截图）：

相关问题更多 >

编程相关推荐

热门问题

热门文章