How to limit data capture with Python BeautifulSoup on a page that refreshes every few seconds

Published 2024-06-09 23:24:56


I am trying to scrape some data. The problem I'm facing is that the page refreshes every few seconds. I want to limit collection to the latest block only, then refresh the scan and hopefully catch the next block as it arrives. Any ideas would be helpful.

Goal #1 - continuously scrape blocks

Goal #2 - eliminate duplicates

from bs4 import BeautifulSoup
from time import sleep
import re, requests

trim = re.compile(r'[^\d,.]+')

url = "https://bscscan.com/txs?a=0x10ed43c718714eb63d5aa57b78b54704e256024e&ps=100&p=1"
baseurl = 'https://bscscan.com/tx/'
header = {"User-Agent": "Mozilla/5.0"}
scans = 0

while True:
    scans += 1
    reqtxsInternal = requests.get(url, headers=header, timeout=2)  # headers must be passed as a keyword argument
    souptxsInternal = BeautifulSoup(reqtxsInternal.content, 'html.parser')
    blocktxsInternal = souptxsInternal.find_all('table')[0].find_all('tr')

    for row in blocktxsInternal[1:]:
        txnhash = row.find_all('td')[1].text[0:]
        txnhashdetails = txnhash.strip()
        block = row.find_all('td')[3].text[0:]
        value = row.find_all('td')[9].text[0:]
        amount = trim.sub('', value).replace(",", "")
        transval = float(amount)
        
        if float(transval) >= 1:
            print ("Doing something with the data -> " + str(block) + "   " + str(transval))
        else:
            pass
    print (" -> Whole Page Scanned: ", scans)
    sleep(1)

Current output: # -- will differ each time the script runs

Doing something with the data -> 10186993   1.233071907624764
Doing something with the data -> 10186993   4.689434542638692
Doing something with the data -> 10186993   27.97137792744322   #-- grab only until here and reload the scan
Doing something with the data -> 10186992   9.0
Doing something with the data -> 10186991   2.98
Doing something with the data -> 10186991   1.0
 -> Whole Page Scanned:  1
Doing something with the data -> 10186994   1.026868093169767
Doing something with the data -> 10186994   4.0
Doing something with the data -> 10186994   4.55582682
Doing something with the data -> 10186994   8.184713205161088
Doing something with the data -> 10186993   1.233071907624764
Doing something with the data -> 10186993   4.689434542638692
Doing something with the data -> 10186993   27.97137792744322
Doing something with the data -> 10186992   9.0
 -> Whole Page Scanned:  2

Desired output:

Doing something with the data -> 10186993   1.233071907624764
Doing something with the data -> 10186993   4.689434542638692
Doing something with the data -> 10186993   27.97137792744322
 -> Whole Page Scanned:  1
Doing something with the data -> 10186994   1.026868093169767
Doing something with the data -> 10186994   4.0
Doing something with the data -> 10186994   4.55582682
Doing something with the data -> 10186994   8.184713205161088
 -> Whole Page Scanned:  2
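
One way to meet goal #2 regardless of how block numbers move is to remember the transaction hashes already processed across scans, since each transaction hash is unique. A minimal sketch (hash values and tuple layout are made up for illustration; in the real script the hash would come from `row.find_all('td')[1].text.strip()`):

```python
# Hypothetical sketch: de-duplicate rows across scans by remembering
# which transaction hashes have already been handled.
seen_hashes = set()

def process_rows(rows):
    """rows: list of (txn_hash, block, value) tuples from one page scan.
    Returns only the rows that are new and have value >= 1."""
    new_rows = []
    for txn_hash, block, value in rows:
        if txn_hash in seen_hashes:
            continue  # already handled in an earlier scan
        seen_hashes.add(txn_hash)
        if value >= 1:
            new_rows.append((block, value))
    return new_rows

# two scans of the refreshing page; "0xbbb" appears in both
scan1 = [("0xaaa", 10186993, 1.23), ("0xbbb", 10186993, 4.68)]
scan2 = [("0xbbb", 10186993, 4.68), ("0xccc", 10186994, 4.0)]
print(process_rows(scan1))  # both rows are new
print(process_rows(scan2))  # only the "0xccc" row survives
```

This keeps working even when two different blocks contain identical values, which a `(block, value)` pair cannot distinguish.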

2 Answers

Continuity only works when the block number increases/decreases.

Since the data changes on every refresh, I'd suggest first collecting the data you need, then de-duplicating it and performing whatever operations you want:

from bs4 import BeautifulSoup
from time import sleep
import re, requests

trim = re.compile(r'[^\d,.]+')

url = "https://bscscan.com/txs?a=0x10ed43c718714eb63d5aa57b78b54704e256024e&ps=100&p=1"
baseurl = 'https://bscscan.com/tx/'
header = {"User-Agent": "Mozilla/5.0"}
scans = 0

all_data = set()
prev_block = 0
while True:
    scans += 1
    reqtxsInternal = requests.get(url, headers=header, timeout=2)  # headers must be passed as a keyword argument
    souptxsInternal = BeautifulSoup(reqtxsInternal.content, 'html.parser')
    blocktxsInternal = souptxsInternal.find_all('table')[0].find_all('tr')
    
    for row in blocktxsInternal[1:]:
        txnhash = row.find_all('td')[1].text[0:]
        txnhashdetails = txnhash.strip()
        block = int(row.find_all('td')[3].text[0:])
        value = row.find_all('td')[9].text[0:]
        amount = trim.sub('', value).replace(",", "")
        transval = float(amount)
        
        if (prev_block != 0) and (block < prev_block):
            # print(block, prev_block)
            continue
        else:
            prev_block = block

        if (block >= prev_block) and (transval >= 1):
            # print(block, prev_block)
            print("Do something with the data -> " + str(block) + " " + str(transval))
            
            # collect the data
            all_data.add((block, transval))
        else:
            pass
        
    print (" -> Whole Page Scanned: ", scans)
    sleep(1)
                
        
# do something with the data
# (note: with `while True` above, this only runs if you break out of the loop)
print('Do something with this collected data:', all_data)

I used Pandas here: it uses BeautifulSoup under the hood, but since the data is a table, I let pandas parse it. That makes the table easy to manipulate.

So it looks like you just want the latest/max "Block" and then any value greater than or equal to 1. Does that do what you need?

import pandas as pd
from time import sleep
import requests

url = "https://bscscan.com/txs?a=0x10ed43c718714eb63d5aa57b78b54704e256024e&ps=100&p=1"
baseurl = 'https://bscscan.com/tx/'
header = {"User-Agent": "Mozilla/5.0"}
scans = 0

while True:
    scans += 1
    reqtxsInternal = requests.get(url, headers=header, timeout=2)  # headers must be passed as a keyword argument
    df = pd.read_html(reqtxsInternal.text)[0]
    df = df[df['Block'] == max(df['Block'])]
    df['Value'] = df['Value'].str.extract(r'(^\d*.*\d+)')  # raw string avoids an invalid-escape warning
    df = df[df['Value'].astype(float) >= 1]
    
    print (df[['Block','Value']])
    print (" -> Whole Page Scanned: ", scans)
    sleep(1)
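
The filtering steps above can be checked offline on a hand-made DataFrame, without hitting bscscan.com. The column names and value strings below are assumptions mirroring what `pd.read_html` would produce for the live table:

```python
import pandas as pd

# simulated parse of one page scan (newest block first)
df = pd.DataFrame({
    "Block": [10186994, 10186994, 10186993, 10186992],
    "Value": ["1.026868 BNB", "0.5 BNB", "27.971377 BNB", "9.0 BNB"],
})

# keep only rows from the newest block
df = df[df["Block"] == df["Block"].max()]
# pull the leading number out of strings like "1.026868 BNB"
df["Value"] = df["Value"].str.extract(r"(^\d*\.?\d+)")
# keep only values >= 1
df = df[df["Value"].astype(float) >= 1]
print(df[["Block", "Value"]])  # one row: block 10186994, value 1.026868
```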

Your other option is to have it check whether the current 'block' is greater than the previous one, and add logic so it only prints in that case:

from bs4 import BeautifulSoup
from time import sleep
import re, requests

trim = re.compile(r'[^\d,.]+')

url = "https://bscscan.com/txs?a=0x10ed43c718714eb63d5aa57b78b54704e256024e&ps=100&p=1"
baseurl = 'https://bscscan.com/tx/'
header = {"User-Agent": "Mozilla/5.0"}
scans = 0
previous_block = 0
while True:
    scans += 1
    reqtxsInternal = requests.get(url, headers=header, timeout=2)  # headers must be passed as a keyword argument
    souptxsInternal = BeautifulSoup(reqtxsInternal.content, 'html.parser')
    blocktxsInternal = souptxsInternal.find_all('table')[0].find_all('tr')

    for row in blocktxsInternal[1:]:
        txnhash = row.find_all('td')[1].text[0:]
        txnhashdetails = txnhash.strip()
        block = int(row.find_all('td')[3].text.strip())  # compare as integers, not strings
        if block > previous_block:
            previous_block = block
        value = row.find_all('td')[9].text[0:]
        amount = trim.sub('', value).replace(",", "")
        transval = float(amount)
        
        if transval >= 1 and block == previous_block:
            print ("Doing something with the data -> " + str(block) + "   " + str(transval))
        else:
            pass
    print (" -> Whole Page Scanned: ", scans)
    sleep(1)
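
Since the table lists the newest transactions first, a variant of this idea is to stop walking the rows as soon as the block number drops below the first one seen in the current scan, matching the "grab only until here and reload the scan" behaviour in the desired output. A hypothetical sketch with simulated parsed rows (the real `(block, value)` pairs would come from the BeautifulSoup loop above):

```python
def scan_rows(rows, last_block):
    """rows: (block, value) pairs in page order (newest first).
    Returns (updated last_block, rows kept this scan)."""
    kept = []
    top_block = rows[0][0] if rows else last_block
    for block, value in rows:
        if block < top_block:
            break  # reached an older block: stop this scan early
        if block > last_block and value >= 1:
            kept.append((block, value))
    return max(top_block, last_block), kept

# first scan: keep only the newest block's rows
page = [(10186993, 1.23), (10186993, 27.97), (10186992, 9.0)]
last, kept = scan_rows(page, 0)
print(kept)  # only the 10186993 rows
```

Rows from blocks at or below `last_block` are skipped on later scans, so each block's transactions are printed at most once.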
