使用Python的for循环遍历链接进行网页抓取
我正在尝试把一些链接放进一个循环里,以便抓取一些网页并填充一个数据框。
这是我的代码:
from typing import Generator
from requests_html import HTMLSession
import pandas as pd
import numpy as np
from itertools import islice
import mysql.connector
from sqlalchemy import create_engine
torneo = 0
for i in torneo:
i + 1
match torneo:
case "0":
matchlinkok = "https://www.betexplorer.com/football/chile/primera-division/results/"
camp = "Chi-A"
case "1":
matchlinkok = "https://www.betexplorer.com/football/algeria/ligue-1/results/"
camp = "Alg-A"
case "2":
matchlinkok = "https://www.betexplorer.com/football/australia/a-league/results/"
camp = "Aus-A"
case "3":
matchlinkok = "https://www.betexplorer.com/football/austria/bundesliga/results/"
camp = "Aut-A"
case _:
print("nothing???")
matchlink = matchlinkok
NM = 20
def _get_rows(url: str) -> Generator[dict[str, str], None, None]:
session = HTMLSession()
r = session.get(matchlink)
allmatch = r.html.find(".in-match")
results = r.html.find(".h-text-center a")
matchodds = r.html.find("[data-odd]")
odds = [matchodd.attrs["data-odd"] for matchodd in matchodds]
idx = 0
N = NM
for match, res in islice(zip(allmatch, results), N):
if res.text == "POSTP.":
continue
if res.text == "0:2 ABN.":
continue
if res.text == "0:3 AWA.":
continue
if res.text == "3:0 AWA.":
continue
if res.text == "0:2 CAN.":
continue
if res.text == "2:0 CAN.":
continue
if res.text == "1:1 CAN.":
continue
print(f"{match.text} Z {res.text} {', '.join(odds[idx:idx+3])}")
yield {
"match": match.text,
"result": res.text,
"odds": ", ".join(odds[idx : idx + 3]),
"best_bets": (odds[idx]), "oddtwo": (odds[idx+1]), "oddthree": (odds[idx+2]),
}
idx += 3
if __name__ == "__main__":
df = pd.DataFrame(_get_rows(matchlink))#.set_index("match")
df[['home','away']] = df['match'].str.split("-",expand=True)
df[['scorehome','scoreaway']] = df['result'].str.split(":",expand=True)
df = df.astype({'scorehome':'int'})
df = df.astype({'scoreaway':'int'})
cols = ['scorehome', 'scoreaway']
#df['tournament'] = "Oma-A"
df['tournament'] = camp
df['sum_stats'] = df[cols].sum(axis=1, numeric_only=True)
df['over05'] = np.where(df['sum_stats']>0, 'OK', 'NO')
df['over15'] = np.where(df['sum_stats']>1, 'OK', 'NO')
df['over25'] = np.where(df['sum_stats']>2, 'OK', 'NO')
df['over35'] = np.where(df['sum_stats']>3, 'OK', 'NO')
df['over45'] = np.where(df['sum_stats']>4, 'OK', 'NO')
df['goal'] = np.where((df['scorehome'] >= 1) &
(df['scoreaway'] >= 1), 'OK', 'NO')
df['esito'] = [ '1' if score_home > score_away else '2' if score_home < score_away else 'X'
for score_home, score_away in zip(df['scorehome'], df['scoreaway'])]
df['result'] = df['result'].str.replace(':','-')
colss = ['home', 'away', 'result']
df['uniquefield'] = df[colss].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df = df[['home','away','scorehome','scoreaway','result','best_bets','oddtwo','oddthree', 'sum_stats', 'over05', 'over15', 'over25', 'over35', 'over45', 'goal', 'esito', 'tournament', 'uniquefield']]
print(df)
目前我遇到了这个错误:
for i in torneo: TypeError: 'int' object is not iterable
我不确定缩进是否正确,但我觉得应该没问题。
我的目标是循环遍历整数,并把它传递给比赛的情况,从中获取每个情况的链接,然后把这些链接放到matchlink里。
谢谢你的时间。
1 个回答
2
与其使用某种计数器,不如把网址的各个部分放到一个列表里,这样你就可以直接遍历这个列表了。
在这段代码中,_get_rows() 的实现只是一个占位符。
from collections.abc import Iterable
parts = [
("chile", "primera-division", "Chi-A"),
("algeria", "ligue-1", "Alg-A"),
("australia", "a-league", "Aus-A"),
("austria", "bundesliga", "Aut-A")
]
def _get_rows(url: str) -> Iterable:
...
for country, division, _ in parts:
url = f"https://www.betexplorer.com/football/{country}/{division}/results"
_get_rows(url)