使用Python的for循环遍历链接进行网页抓取

0 投票
1 回答
49 浏览
提问于 2025-04-12 15:13

我正在尝试把一些链接放进一个循环里,以便抓取一些网页并填充一个数据框。

这是我的代码:

from typing import Generator
from requests_html import HTMLSession
import pandas as pd
import numpy as np
from itertools import islice
import mysql.connector
from sqlalchemy import create_engine

# Index of the tournament to scrape; it is the subject of the `match` below.
torneo = 0

# NOTE(review): `torneo` is an int, so this loop raises
# "TypeError: 'int' object is not iterable". The body's `i + 1` is also
# evaluated and thrown away, so it would not advance anything.
for i in torneo:
   i + 1

# Select the results-page URL (`matchlinkok`) and a short tournament code
# (`camp`) for the chosen tournament.
# NOTE(review): every case pattern is a string ("0", "1", ...) while `torneo`
# holds the int 0, so as written only the wildcard case can match.
match torneo:   
   
    case "0":
        matchlinkok = "https://www.betexplorer.com/football/chile/primera-division/results/"
        camp = "Chi-A"

    case "1":
        matchlinkok = "https://www.betexplorer.com/football/algeria/ligue-1/results/"
        camp = "Alg-A"

    case "2":
        matchlinkok = "https://www.betexplorer.com/football/australia/a-league/results/"
        camp = "Aus-A" 

    case "3":
        matchlinkok = "https://www.betexplorer.com/football/austria/bundesliga/results/"
        camp = "Aut-A"   

    # Fallback branch: neither `matchlinkok` nor `camp` is assigned here, so the
    # assignment to `matchlink` below fails with NameError when this branch runs.
    case _:
        print("nothing???")
   

# URL handed to _get_rows by the __main__ block.
matchlink = matchlinkok

# Upper bound on how many (match, result) pairs _get_rows examines.
NM = 20

def _get_rows(url: str) -> Generator[dict[str, str], None, None]:
    """Scrape one results page and yield a row for each played match.

    Only the first ``NM`` (match, result) pairs found on the page are
    examined. Pairs whose result text is one of the markers listed in
    ``skipped_results`` are dropped without consuming any odds.

    Args:
        url: Address of the results page to download.

    Yields:
        A dict with the keys ``match``, ``result``, ``odds``, ``best_bets``,
        ``oddtwo`` and ``oddthree``.

    Raises:
        IndexError: If fewer than three odds remain for a match that is
            about to be yielded.
    """
    # Result texts that mark a fixture as not regularly played; such rows
    # are skipped entirely.
    skipped_results = frozenset({
        "POSTP.",
        "0:2 ABN.",
        "0:3 AWA.",
        "3:0 AWA.",
        "0:2 CAN.",
        "2:0 CAN.",
        "1:1 CAN.",
    })

    # Everything is extracted from the page before the first yield, so the
    # session is closed promptly instead of staying open while the consumer
    # iterates over the generator.
    with HTMLSession() as session:
        # Fetch the page the caller asked for, not the module-level matchlink.
        response = session.get(url)

        allmatch = response.html.find(".in-match")
        results = response.html.find(".h-text-center a")
        odds = [
            element.attrs["data-odd"]
            for element in response.html.find("[data-odd]")
        ]

    # Position of the current match's first odd inside the flat odds list;
    # it only advances for rows that are actually yielded.
    idx = 0
    for match, res in islice(zip(allmatch, results), NM):
        if res.text in skipped_results:
            continue

        joined_odds = ", ".join(odds[idx:idx + 3])
        print(f"{match.text} Z {res.text} {joined_odds}")
        yield {
            "match": match.text,
            "result": res.text,
            "odds": joined_odds,
            "best_bets": odds[idx],
            "oddtwo": odds[idx + 1],
            "oddthree": odds[idx + 2],
        }

        idx += 3


if __name__ == "__main__":
    # Collect every scraped row into a frame (one row per yielded match).
    df = pd.DataFrame(_get_rows(matchlink))

    # Break the combined "match" and "result" strings into separate columns,
    # then turn both score columns into integers in a single conversion.
    df[['home', 'away']] = df['match'].str.split("-", expand=True)
    df[['scorehome', 'scoreaway']] = df['result'].str.split(":", expand=True)
    df = df.astype({'scorehome': 'int', 'scoreaway': 'int'})

    cols = ['scorehome', 'scoreaway']
    df['tournament'] = camp
    df['sum_stats'] = df[cols].sum(axis=1, numeric_only=True)

    # "Over N.5" flags: 'OK' when the total number of goals exceeds N.
    for column, threshold in (
        ('over05', 0),
        ('over15', 1),
        ('over25', 2),
        ('over35', 3),
        ('over45', 4),
    ):
        df[column] = np.where(df['sum_stats'] > threshold, 'OK', 'NO')

    # 'OK' when both sides scored at least once.
    both_scored = (df['scorehome'] >= 1) & (df['scoreaway'] >= 1)
    df['goal'] = np.where(both_scored, 'OK', 'NO')

    # Outcome code: '1' home win, '2' away win, 'X' draw.
    outcomes = []
    for score_home, score_away in zip(df['scorehome'], df['scoreaway']):
        if score_home > score_away:
            outcomes.append('1')
        elif score_home < score_away:
            outcomes.append('2')
        else:
            outcomes.append('X')
    df['esito'] = outcomes

    df['result'] = df['result'].str.replace(':', '-')

    # Space-joined "home away result" string for each row.
    colss = ['home', 'away', 'result']
    df['uniquefield'] = df[colss].apply(
        lambda row: ' '.join(row.values.astype(str)),
        axis=1,
    )

    # Keep only the output columns, in their final order.
    df = df[[
        'home', 'away', 'scorehome', 'scoreaway', 'result',
        'best_bets', 'oddtwo', 'oddthree',
        'sum_stats', 'over05', 'over15', 'over25', 'over35', 'over45',
        'goal', 'esito', 'tournament', 'uniquefield',
    ]]

    print(df)

目前我遇到了这个错误:

for i in torneo: TypeError: 'int' object is not iterable

我不确定缩进是否正确,但我觉得应该没问题。

我的目标是循环遍历这些整数,把每个整数传给 match 语句,由对应的 case 分支给出链接,再把这些链接依次赋给 matchlink。

谢谢你的时间。

1 个回答

2

与其使用某种计数器,不如把网址的各个部分放到一个列表里,这样你就可以直接遍历这个列表了。

在这段代码中,_get_rows() 的实现只是一个占位符。

from collections.abc import Iterable

# One entry per league: (country path segment, division path segment, label).
# The first two values are interpolated into the results URL below; the third
# looks like a short league label and is not used in this snippet.
parts = [
    ("chile", "primera-division", "Chi-A"),
    ("algeria", "ligue-1", "Alg-A"),
    ("australia", "a-league", "Aus-A"),
    ("austria", "bundesliga", "Aut-A")
]

def _get_rows(url: str) -> Iterable:
    """Placeholder for the scraper; receives the results-page URL to process."""
    ...

# Build each league's results URL from its two path segments and pass it to
# _get_rows. The value returned by _get_rows is discarded here.
for country, division, _ in parts:
    url = f"https://www.betexplorer.com/football/{country}/{division}/results"
    _get_rows(url)

撰写回答