如何使用lxml抓取一个表并获取href链接？

import requests
import lxml.html as lh
import pandas as pd

# Sample site where the table is
response = requests.get('https://especiais.gazetadopovo.com.br/futebol/tabela-campeonato-brasileiro-2018')
# Store the parsed contents of the website under doc
doc = lh.fromstring(response.content)
# Collect every <tr>..</tr> element of the HTML document
tr_elements = doc.xpath('//tr')

# The first row is treated as the header: pair each header cell's text with
# an empty list that will receive that column's values.
col = [(t.text_content(), []) for t in tr_elements[0]]

# Since our first row is the header, data is stored on the second row onwards
for T in tr_elements[1:]:
    # If row is not of size 10, the //tr data is not from our table
    if len(T) != 10:
        break
    # i is the index of our column
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # The first column is kept as text; every other column is converted
        # to an integer when its text is numeric.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Non-numeric cell: keep the original text
                pass
        # Append the data to the list of the i'th column
        col[i][1].append(data)

# Creates the dataframe (one column per header title)
Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)

P J V E D GP GC SG Link 0 Palmeiras 80 38 23 11 4 64 26 38 https://especiais.gazetadopovo.com.br/futebol/times/palmeiras/ 1 Flamengo 72 38 21 9 8 59 29 30 https://especiais.gazetadopovo.com.br/futebol/times/flamengo/

1条回答

网友

1楼 · 发布于 2024-04-25 08:37:58

可以按以下方式提取链接：

import re
import requests
import pandas as pd
import lxml.html as lh

# Fetch the championship table page once; the raw text is used for the link
# regex and the raw bytes are parsed by lxml.
response = requests.get('https://especiais.gazetadopovo.com.br/futebol/tabela-campeonato-brasileiro-2018')

# Capture the href that immediately follows the 'times link-time' marker.
# [^"]* stops at the closing quote of the attribute, so a match can never run
# past the URL even when several quoted attributes share one line.
links = re.findall(r'times link-time"><a href="(https:[^"]*times[^"]*)"', response.text)

doc = lh.fromstring(response.content)
tr_elements = doc.xpath('//tr')

# The first <tr> is treated as the header: pair each header cell's text with
# an empty list that will receive that column's values.
col = [(t.text_content(), []) for t in tr_elements[0]]

for T in tr_elements[1:]:
    # A row that does not have exactly 10 cells is taken to be outside the
    # table, so collection stops there.
    if len(T) != 10:
        break
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # The first column is kept as text; every other column is converted
        # to an integer when its text is numeric.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Non-numeric cell: keep the original text
                pass
        col[i][1].append(data)

Dict = {title: column for (title, column) in col}
# NOTE: pandas requires every column to have the same length, so the number
# of captured links has to equal the number of collected rows.
Dict['Link'] = links
df = pd.DataFrame(Dict)

最后我要说的是：

相关问题更多 >

编程相关推荐

热门问题

热门文章