当没有更多表格可抓取时,如何停止爬取?

2024-03-29 02:16:24 发布

您现在位置:Python中文网/ 问答频道 /正文

当当前页面不再有任何可供 pandas 读取的表格时,如何阻止 for 循环继续请求下一页?

目前我的代码固定遍历 pageRange + 1 页,但无法事先知道什么时候已经没有更多页面需要解析。

谢谢

# --- Which view to scrape ---------------------------------------------------
# Root URL of the logged-in entity list page; the query string is appended to it.
wmsLoggedIn = "https://enterurlhere/solution/entitylist.htm?"
entityName = "Shipment"  # entity of the view
currentView = 51339  # id of the view to pull data from
pageSize = 300  # results requested per page
pageNo = 0  # starting page; page numbering starts at 0
pageRange = 2  # last page index to request (pages 0..pageRange are fetched)

# Folder the CSV export is written into (trailing separator included).
savefilesin = "G:\\webscraper\\Scraped Files\\"

# --- Pieces of the output file name -----------------------------------------
# Both values are computed once here, so every generate_filename() call
# yields the same name for the whole run.
now = datetime.now()
randomstring = ''.join(
    random.choice(string.ascii_letters + string.digits) for _ in range(6)
)

def generate_filename():
    """Return the CSV file name for this run.

    The name has the form ``<entityName>-<MMDDYYYY-HHMMSS>-<randomstring>.csv``
    and is assembled from the module-level ``entityName``, ``now`` and
    ``randomstring`` values, so repeated calls return the same string as
    long as those globals are not reassigned.
    """
    timestamp = now.strftime("%m%d%Y-%H%M%S")
    parts = (entityName, timestamp, randomstring)
    return "-".join(str(part) for part in parts) + ".csv"

# Walk the pages of the view, clean every table found and collect the results.
modified_dfs = []
# pageRange acts only as an upper bound: the loop also stops as soon as a page
# contains no matching table, so the exact page count need not be known.
for pageNo in range(pageRange + 1):
    urlSpecifics = "entityName={entityName}&pageNo={pageNo}&pageSize={pageSize}&currentViewId={currentView}".format(entityName=entityName, pageNo=pageNo, pageSize=pageSize, currentView=currentView)
    # open entity page
    driver.get(wmsLoggedIn + urlSpecifics)
    html = driver.page_source
    try:
        page_dfs = pd.read_html(html, attrs={"class": "roundedTable"}, header=5)
    except ValueError:
        # read_html never returns an empty list; it raises ValueError when no
        # table matches. On the very first page that means nothing could be
        # scraped at all, so keep failing loudly as before.
        if not modified_dfs:
            raise
        # On a later page it means we ran past the last page of results.
        # NOTE(review): any other ValueError from read_html on a later page
        # also ends the loop here - confirm that is acceptable.
        break
    for df in page_dfs:
        df.dropna(how="all", axis="columns", inplace=True)  # drop empty columns
        df.drop(columns=["No"], inplace=True)  # drop the "No" column
        df.dropna(how="all", axis=0, inplace=True)  # drop empty rows
        modified_dfs.append(df)

# Write the combined result once, after all pages were read, instead of
# re-concatenating and overwriting the same file after every page.
if modified_dfs:
    pd.concat(modified_dfs).to_csv(savefilesin + generate_filename(), index=False)

# Report the total run time and the output file name, then close the browser.
# start_time is expected to have been set before this point (not shown here).
elapsed_time = time.time() - start_time
elapsed_clock = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
saved_as = generate_filename()
print("Process Success and saved the file as: " + saved_as + " It took " + elapsed_clock + " to process!")
driver.quit()