使用 BeautifulSoup 抓取网页

# Scrape the 'bigborder' table from each HKJC horse page and collect every
# data row, tagged with the season heading that precedes it, into one DataFrame.
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd

url_list = [
    'http://www.hkjc.com/english/racing/horse.asp?HorseNo=S217',
    'http://www.hkjc.com/english/racing/horse.asp?HorseNo=A093',
    'http://www.hkjc.com/english/racing/horse.asp?HorseNo=V344',
    'http://www.hkjc.com/english/racing/horse.asp?HorseNo=V077',
    'http://www.hkjc.com/english/racing/horse.asp?HorseNo=P361',
    'http://www.hkjc.com/english/racing/horse.asp?HorseNo=T103',
]

# Seconds to wait per request. requests.get() has no default timeout, so a
# stalled connection would otherwise block the script forever.
REQUEST_TIMEOUT = 30

# Placeholder labels spliced into the header list at positions 19-21 and
# dropped again once the DataFrame has been built.
# NOTE(review): presumably the data rows carry three cells that the header
# row does not label -- confirm against the live page.
PSEUDO_COLUMNS = ['pseudocol1', 'pseudocol2', 'pseudocol3']


def _clean_cell(text):
    """Strip surrounding newlines, carriage returns, tabs, quotes and spaces."""
    return text.strip('\n').strip('\r').strip('\t').strip('"').strip()


def _parse_table(trs):
    """Split the <tr> elements of one table into column labels and data rows.

    Args:
        trs: non-empty list of <tr> tags; the first one is the header row.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is the header cell texts plus
        a trailing 'Season' label, with the placeholder columns inserted.
        ``rows`` holds one list of cleaned cell texts per data row, each
        ending with the most recent season heading.
    """
    headers = [td.text for td in trs[0].find_all('td')]
    headers.append('Season')
    for offset, name in enumerate(PSEUDO_COLUMNS):
        headers.insert(19 + offset, name)

    rows = []
    season = ''
    for tr in trs[1:]:
        if 'Season' in tr.text:
            # Season heading row: remember it for the data rows that follow.
            season = tr.text
            continue
        # Build a fresh list for every <tr>; reusing one list would carry the
        # cells of earlier rows over into later ones.
        row = [_clean_cell(td.text) for td in tr.find_all('td')]
        row.append(season.strip())
        rows.append(row)
    return headers, rows


res = []
# Defined up front so the DataFrame can still be built when no page yields a
# table; otherwise it holds the labels of the last page that was parsed.
headers_list = []

for link in url_list:
    r = requests.get(link, timeout=REQUEST_TIMEOUT)
    r.encoding = 'utf-8'
    soup = BS(r.text, 'lxml')

    table = soup.find('table', class_='bigborder')
    if not table:
        continue  # no matching table on this page; try the next link

    trs = table.find_all('tr')
    if not trs:
        continue  # table without rows; try the next link

    headers_list, rows = _parse_table(trs)
    res.extend(rows)

# Discard rows whose first cell is empty.
res = [i for i in res if i[0] != '']
df = pd.DataFrame(res, columns=headers_list)
# errors='ignore' keeps this step from failing when nothing was scraped and
# the columns therefore do not exist.
df = df.drop(columns=PSEUDO_COLUMNS + ['VideoReplay'], errors='ignore')

2条回答

网友

1楼 · 编辑于 2024-05-15 21:01:08

在我看来，如果不在上面的 res.append(row) 之后重置 row，那么每次追加时都会把之前的结果重复存进去，而且越积越多。

网友

2楼 · 编辑于 2024-05-15 21:01:08

循环中的 row=[] 会让 row 重新变成一个空列表。由于该列表只在 for 循环之前声明了一次，如果不这样做，上一次迭代中追加的元素就会被带到下一次迭代中。准确地说，row=[] 是让 row 指向一个新的空列表，因此已经存入 res 的那一行不会受到影响。

相关问题更多 >

编程相关推荐

热门问题

热门文章