从Python Beautifulsoup提取的表数据中对结果进行分组,以提高可读性

2024-05-16 14:43:50 发布

您现在位置:Python中文网/ 问答频道 /正文

下面的代码段正在运行,但为了可读性,我需要帮助将结果格式化到屏幕中

from urllib.request import Request, urlopen,urljoin
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep

# Scrape the token-transfer table at `url` and print, for every data row,
# the token contract address, the token name and the transferred value.
url = 'https://bscscan.com/tokentxns'

# Pool of browser User-Agent strings; one is picked at random per run.
# These are the bare header *values* -- the header dict is built below.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# requests only sends custom headers when they are given as a dict through
# the `headers` keyword. The second positional argument of requests.get() is
# `params`, so passing the header there put it in the query string instead.
header = {'User-Agent': random.choice(user_agent_list)}
req = requests.get(url, headers=header, timeout=10)
req.raise_for_status()  # fail with a clear HTTP error instead of a parse error
soup = BeautifulSoup(req.content, 'html.parser')
rows = soup.find_all('table')[0].find_all('tr')

# The first <tr> is skipped (presumably the table's header row).
for row in rows[1:]:
    tds = row.find_all('td')
    if len(tds) < 9:
        # Not a full data row (columns 7 and 8 are read below); skip it.
        continue
    value = tds[7].text
    token = tds[8].text
    link = urljoin(url, tds[8].find('a')['href'])
    # 26 == len('https://bscscan.com/token/'): dropping that prefix leaves the
    # token address, assuming the href has the form '/token/<address>'.
    print(link[26:] + "\t" + token + "\t\t" + value)

当前输出:

0x154a9f9cbd3449ad22fdae23044319d6ef2a1fab   CryptoBlades... (SKILL)    0
0x46d502fac9aea7c5bc7b13c8ec9d02378c33d36f   WolfSafePoor... (WSPP)     532,654,321,110
0xb510e39a6cc3ebe999ff957ae7b5813d3326af88   GoldenBresco (GoBo)        0.1
0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c   Wrapped BNB (WBNB)         0.193446389516094066
0xb510e39a6cc3ebe999ff957ae7b5813d3326af88   GoldenBresco (GoBo)        0.003

需要改进:#分为3列

0x154a9f9cbd3449ad22fdae23044319d6ef2a1fab   CryptoBlades... (SKILL)    2.746949883778173559
                                             CryptoBlades... (SKILL)    0.971749999999999991
                                             CryptoBlades... (SKILL)    0

0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c   Wrapped BNB (WBNB)         0.1
                                             Wrapped BNB (WBNB)         0.193446389516094066
                                             Wrapped BNB (WBNB)         0.3

Tags: importmozillachromeskillsafarilikeagentheader
2条回答

尝试:

import requests
from bs4 import BeautifulSoup
from itertools import groupby

# Download the token-transfer page, collect (hash, token, value) for every
# table row, then print the rows grouped by transaction hash. The hash is
# shown only on the first line of each group.
url = "https://bscscan.com/tokentxns"

response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

data = []
for tr in soup.select("tr:has(td)"):
    cells = [cell.get_text(strip=True) for cell in tr.select("td")]
    # Exactly nine cells are expected per row; only the hash (index 1),
    # value (index 7) and token (index 8) are used.
    _, txn_hash, _tm, _age, _from, _, _to, value, token = cells
    data.append((txn_hash, token, value))

# groupby() only merges adjacent items, so the records must be sorted first.
data.sort()
for txn_hash, records in groupby(data, key=lambda record: record[0]):
    for position, (_, token, value) in enumerate(records):
        first_column = txn_hash if position == 0 else ""
        print("{:<67} {:<27} {:<20}".format(first_column, token, value))
    # Blank line between groups.
    print()

输出:

0x0883f7ada1e30d266366577dbc46cd86a8deb737d669758a443ef03859ea551a  FEGtoken (FEG)              1,946,201,644.40754275
                                                                    Wrapped BNB (WBNB)          0.025356409113673479

0x41a7e28aa1f88522ba477718f9ea93d927bd8c456cd77c75691d961ac01da626  KOMOCOIN (KMC)              1,500               
                                                                    KOMOCOIN (KMC)              750                 

0x54bf03ddb42a151920fc2352a8419ed24720422b79c4956c74ab1d51aead142e  BABY CAKE (BABYCA...)       140.806276687606518422
                                                                    BABY CAKE (BABYCA...)       165.654443161890021673
                                                                    BABY CAKE (BABYCA...)       2,164,578.319665288243959287
                                                                    BABY CAKE (BABYCA...)       238.930554998160499529
                                                                    BABY CAKE (BABYCA...)       42.164215587910676387
                                                                    BABY CAKE (BABYCA...)       462,482.805614060076081865
                                                                    BABY CAKE (BABYCA...)       797.902234563103604395
                                                                    BABY CAKE (BABYCA...)       938.708511250710122817
                                                                    BABY CAKE PR...(BBCAKE...)  190,322,532.495690243057683413
                                                                    BABY CAKE PR...(BBCAKE...)  2,526,729.458161278746350005
                                                                    BABY CAKE PR...(BBCAKE...)  251,979.604709746169304594
                                                                    BABY CAKE PR...(BBCAKE...)  252,609.914806456810514054
                                                                    BABY CAKE PR...(BBCAKE...)  36,251,910.951560046296701602
                                                                    BABYCAKE_Div...(BABYCA...)  238.930554998160499529
                                                                    Pancake LPs (Cake-L...)     0.222139817418176568
                                                                    Pancake LPs (Cake-L...)     13.786493105169560097
                                                                    Pancake LPs (Cake-L...)     486.96534350290155168
                                                                    Pancake LPs (Cake-L...)     5.76850094907955108 
                                                                    PancakeSwap ...(Cake)       0.001286990618481616
                                                                    PancakeSwap ...(Cake)       0.112893929385320841
                                                                    PancakeSwap ...(Cake)       1.497338191475435628
                                                                    PancakeSwap ...(Cake)       61.821404790611192339
                                                                    PancakeSwap ...(Cake)       61.821404790611192339
                                                                    Wrapped BNB (WBNB)          0.000146050638113703
                                                                    Wrapped BNB (WBNB)          0.000146050638113703
                                                                    Wrapped BNB (WBNB)          0.00146079350317574 
                                                                    Wrapped BNB (WBNB)          0.109629866733835175
                                                                    Wrapped BNB (WBNB)          0.610745057130530703
                                                                    Wrapped BNB (WBNB)          2.850122532653068215

0x6cc6153aa387de6a56c905f7d424ec38f047fefdcc2b7d766c53db7807b6f562  CryptoBlades...(SKILL)      0.005999999999999999
                                                                    CryptoBlades...(SKILL)      0.06                

0x776a1edc9446cc3e160cb08a69e2824dab0e6df7b6c79f252a1c9a0de4733bd4  Arena Token (ARENA)         0.000802589119468346
                                                                    Arena Token (ARENA)         0.037402597402597402
                                                                    Arena Token (ARENA)         0.374025974025974025

0x7ca15e96d56d686d79a93271e192021fefed01187dce424bec835f1a6a47b937  CryptoBlades...(SKILL)      0.971749999999999991

0x7f6bada297def57a2d1823000d464923187bea376c5747ba6ebe0b63b1ae1850  CryptoBlades...(SKILL)      0                   

0x8ddaceff011648b2f13128c8ce4ff5654171878200e12f2ce8f9cf3ec4ab97a3  CryptoBlades...(SKILL)      0.051999999999999999
                                                                    CryptoBlades...(SKILL)      0.52                

0x91d299dc263ac4e30027c5e54e5a5fd4fd2fb814db7c0fc00643764f8710e47b  CryptoBlades...(SKILL)      0                   

0xa097fad173e3d6551e2a837048f40348ffcafc710ca13410de1fb532f2833ba7  Niubi Token (NIU)           2,152.08364390963091904
                                                                    Wrapped BNB (WBNB)          0.05                

0xf2c10ec09049cd810c3aac459b85b9bbbcbb53f3b78341d24af1cab585d6e1ba  Foxy Equilib...(Foxy)       0.9                 
                                                                    Foxy Equilib...(Foxy)       0.9                 
                                                                    Foxy Equilib...(Foxy)       7.2                 

0xf5b44e82e4e4509d59b51491ce1bfa44888fae2c11a65bd5021d2aed9c75afd4  CryptoBlades...(SKILL)      0.055005280975673767
                                                                    Wrapped BNB (WBNB)          0.022533425242910644


编辑:如果要打印代币的URL而不是名称,可以这样做:

import requests
from bs4 import BeautifulSoup
from itertools import groupby

# Same grouping as a plain hash/token/value listing, but the second column
# holds the URL taken from the last link in each row instead of the token name.
url = "https://bscscan.com/tokentxns"

page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

data = []
for tr in soup.select("tr:has(td)"):
    texts = [td.get_text(strip=True) for td in tr.select("td")]
    # Exactly nine cells are expected per row; the token name (last cell)
    # is unpacked but replaced by the link below.
    _, txn_hash, _tm, _age, _from, _, _to, value, _token = texts
    # The last <a> in the row is used as the token link.
    token_url = "https://bscscan.com" + tr.select("a")[-1]["href"]
    data.append((txn_hash, token_url, value))

# groupby() only merges adjacent items, so the records must be sorted first.
data.sort()
for txn_hash, members in groupby(data, lambda record: record[0]):
    # Show the hash on the first line of the group only.
    leading = txn_hash
    for _, token_url, value in members:
        print("{:<67} {:<27} {:<20}".format(leading, token_url, value))
        leading = ""
    print()

关于pastebin,我没有收到您的回复,所以这里给出一个使用带样式的DataFrame的简单方法。正如我在评论中所说,您似乎只是想按第一列排序,并且不在该列中重复显示相同的项。您可以使用sort_values()进行排序,并使用duplicated()将重复项替换为''。我借用了Andrej(已点赞)的简洁语法来填充这个嵌套列表

您可以根据需要设置数据框的样式。例如,我隐藏了单元格之间的边框,并将背景设置为白色

import pandas as pd
from bs4 import BeautifulSoup
import requests, random

# Scrape the token-transfer table at `url` into a DataFrame, sort it by
# transaction hash, blank out repeated hashes and build a styled view of it.
url = 'https://bscscan.com/tokentxns'

# Pool of browser User-Agent strings; one is picked at random per run.
# These are the bare header *values* -- the header dict is built below.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# requests only sends custom headers when they are given as a dict through
# the `headers` keyword. The second positional argument of requests.get() is
# `params`, so passing the header there put it in the query string instead.
header = {'User-Agent': random.choice(user_agent_list)}
req = requests.get(url, headers=header, timeout=10)
req.raise_for_status()  # fail with a clear HTTP error instead of a parse error
soup = BeautifulSoup(req.content, 'html.parser')
df_rows = []

for tr in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    # Exactly nine cells are expected per row; only hash, token and value
    # are kept.
    _, txnhash, tm, age, from_, _, to_, value, token = tds
    df_rows.append([txnhash, token, value])

df = pd.DataFrame(df_rows, columns=['hash', 'token', 'value'])
# Strip thousands separators so the values parse as numbers.
df['value'] = pd.to_numeric(df['value'].str.replace(',', '', regex=False))
df.sort_values(['hash', 'token'], inplace=True)
# Keep the hash only on the first row of each group; blank the repeats.
df['hash'] = df['hash'].mask(df.duplicated(subset=['hash']), '')

styler = df.style.format(formatter={'value': "{:,.3f}"})
# Styler.hide_index() was removed in pandas 2.0 in favour of
# Styler.hide(axis="index"); fall back to the old name on older versions.
styler = styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()
# Left as the final expression so a notebook renders the styled table.
styler \
  .set_properties(**{'background-color': 'white', 'text-align': 'left'}, padding="10px", border='0px solid white') \
  .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

相关问题 更多 >