如何解析网页中的查询字符串？

import requests
from bs4 import BeautifulSoup
from datetime import datetime
# This module import rebinds the name "datetime" to the module, which is what
# the datetime.datetime.now(...) call below relies on.
import datetime
import dateutil.parser
import time
import pytz

# python espncricinfo library module
# https://github.com/dwillis/python-espncricinfo
from espncricinfo.match import Match
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError

# ---- time-zone calculation ----
# Today's date in the Asia/Kolkata time zone, formatted as YYYY-MM-DD.
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
datestring_today = datetime_today.strftime("%Y-%m-%d")

# ---- URL of the page to parse, with today's date ----
# The date is appended to the string; putting the variable name inside the
# literal would request "?date=datestring_today" verbatim.
# e.g. url = http://www.espncricinfo.com/ci/engine/match/index.html?date=2018-02-12
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today

r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

# ---- parsing for match numbers ----
# For every "Scorecard" link: keep what follows the fourth '/' of the href,
# then cut it at the first '.'.
# NOTE(review): this only yields a bare match number when the href has the
# id as its fifth '/'-separated piece followed by '.'; verify against the
# hrefs the page actually serves.
match_no = [
    x['href'].split('/', 4)[4].split('.')[0]
    for x in soup.findAll('a', href=True, text='Scorecard')
]

for p in match_no:
    # p is a match number string, e.g. p = '1122282'
    m = Match(p)
    print(m.latest_batting)

['8890/scorecard/1118760/andhra-vs-tamil-nadu-group-c-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118743/assam-vs-odisha-group-a-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118745/bengal-vs-delhi-group-b-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118763/chhattisgarh-vs-vidarbha-group-d-vijay-hazare-trophy-2017-18/']

1条回答

网友

1楼 · 发布于 2024-04-28 04:30:44

首先，你的代码很难阅读。你需要让代码“呼吸”，这样别人才愿意去读它。

第二，引发问题的原因可能是：

match_no = [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')]

这一行同样很难阅读。从 URL 中解析比赛 ID 有更好、更易读的方法。

这是一个应该起作用的例子。我确定了比赛的临时日期：

import re

import pytz
import requests
import datetime
from bs4 import BeautifulSoup
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
from espncricinfo.match import Match

"""python espncricinfo library module https://github.com/dwillis/python-espncricinfo """
# from espncricinfo.match import Match


def get_match_id(link):
    """Return the first run of seven consecutive digits found in *link*.

    Args:
        link: The string to search, e.g. a scorecard href.

    Returns:
        The seven matched digits as a string, or None when *link* contains
        no run of seven digits.
    """
    found = re.search(r'([0-9]{7})', link)
    # A match object is always truthy, so this only distinguishes "no match".
    return found.group() if found else None

# Today's date in the Asia/Kolkata time zone, formatted as YYYY-MM-DD for the
# "date" query parameter.
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
datestring_today = datetime_today.strftime("%Y-%m-%d")

# URL of the page to parse, with today's date. The date is appended to the
# string; writing the variable name inside the literal would request
# "?date=datestring_today" verbatim.
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today

r = requests.get(url)

soup = BeautifulSoup(r.text, 'html.parser')

spans = soup.findAll('span', {"class": "match-no"})

matches_ids = []

for s in spans:
    # BeautifulSoup calls the filter with None for <a> tags that have no
    # href attribute, so guard before the substring test.
    for a in s.findAll('a', href=lambda href: href is not None and 'scorecard' in href):
        match_id = get_match_id(a['href'])
        if match_id is None:
            continue
        matches_ids.append(match_id)


# Build a Match for every collected id and print its latest_batting attribute.
for p in matches_ids:
    # p is a match number string, e.g. p = '1122282'
    m = Match(p)
    print(m.latest_batting)

我这里没有你用到的所有库，但这应该能让你了解大致的做法。

再说一次，我的建议是：空行是你的朋友，更是读者的朋友。让你的代码“呼吸”。

相关问题更多 >

编程相关推荐

热门问题

热门文章