How do we navigate to a webpage, scrape its data, move to the next page, and do it again?

Published 2024-05-15 03:40:56


I've made a few attempts to get my code to navigate to a webpage, import the data from a table on that page into a DataFrame, then move to the next page and do the same thing again. Below is some sample code I've been testing. Right now I'm stuck; I don't know how to continue.

# first attempt
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from time import sleep

lst = []
url = "https://www.nasdaq.com/market-activity/stocks/screener"

for numb in (1, 10):
    url = "https://www.nasdaq.com/market-activity/stocks/screener"
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find_all('table')
    df = pd.DataFrame(table)
    lst.append(df)
    
    def get_cpf():
        driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
        driver.get(url)
        driver.find_element_by_class('pagination__page" data-page="'' +  numb + ''').click()
        sleep(10)
        text=driver.find_element_by_id('texto_cpf').text
        print(text)
    get_cpf()
    get_cpf.click
    

### second attempt
#import BeautifulSoup
from bs4 import BeautifulSoup
import pandas as pd
import requests
from selenium import webdriver
from time import sleep

lst = []

for numb in (1, 10):
    r=requests.get('https://www.nasdaq.com/market-activity/stocks/screener')
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    table = soup.find( "table", {"class":"nasdaq-screener__table"} )
    
    for row in table.findAll("tr"):
        for cell in row("td"):
            data = cell.get_text().strip()
            df = pd.DataFrame(data)
            lst.append(df)
            
    def get_cpf():
        driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
        driver.get(url)
        driver.find_element_by_class('pagination__page" data-page="'' +  numb + ''').click()
        sleep(10)
        text=driver.find_element_by_id('texto_cpf').text
        print(text)
    get_cpf()
    get_cpf.click


### third attempt
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
import requests
import pandas as pd

lst = []

url="https://www.nasdaq.com/market-activity/stocks/screener"
driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
wait = WebDriverWait(driver, 10)
driver.get(url)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#_evh-ric-c"))).click()
for pages in range(1,9):
    try:
        print(pages)
        r = requests.get(url)
        html = r.text
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find_all('table')
        df = pd.DataFrame(table)
        lst.append(df)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button.pagination__next"))).click()
        time.sleep(1)
    except:
        break

Here is a screenshot of the HTML behind the table I'm trying to scrape:

[screenshots of the screener table and its HTML source omitted]

So, on the first page, I want to scrape everything from:

AAPL    Apple Inc. Common Stock $127.79 6.53    5.385%  2,215,538,678,600

to:

ASML    ASML Holding N.V. New York Registry Shares  $583.55 16.46   2.903%  243,056,764,541

Then move to page 2 and do the same thing, then page 3, and so on. I'm not sure whether this is possible with BeautifulSoup alone, or whether I need Selenium for the button-click event. I'm open to whatever is simplest here. Thanks.


2 Answers

I'm not going to deal with the API here, since Nuran only wants to do it the way they asked.

Here is an example that walks through the first 10 pages. First we dismiss the notification, then wait until the Next button is clickable and click it.

driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
wait = WebDriverWait(driver, 10)
driver.get("https://www.nasdaq.com/market-activity/stocks/screener")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#_evh-ric-c"))).click()
#Currently you start on the 1st page and say we want to click 9 times for the 10th page
for pages in range(1,10):
    try:
        print(pages)
        #Get your data from this page
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button.pagination__next"))).click()
        #This is just here to slow everything down, so it may be removable.
        time.sleep(5)
    except:
        break

进口

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

Then you can grab the table on each page like this:

# parse the table that is currently rendered in the browser
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
div = soup.select_one("table.nasdaq-screener__table")
table = pd.read_html(str(div))
print(table[0])
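
Putting the two snippets together, a rough sketch of the whole flow might look like the following (it assumes the same chromedriver path as in the question, and the selectors used above, which may change if Nasdaq updates the page):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
wait = WebDriverWait(driver, 10)
driver.get("https://www.nasdaq.com/market-activity/stocks/screener")
# dismiss the notification banner first
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#_evh-ric-c"))).click()

frames = []
for page in range(1, 10):
    # parse the table currently rendered in the browser
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.select_one("table.nasdaq-screener__table")
    frames.append(pd.read_html(str(table))[0])
    # move on to the next page and give it a moment to load
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.pagination__next"))).click()
    time.sleep(1)

driver.quit()
df = pd.concat(frames, ignore_index=True)
print(df)

Each pass through the loop appends one page's table, and pd.concat stitches all the pages into a single DataFrame at the end.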

Note that you don't need Selenium for this kind of task; it only slows your process down.

In real-world scenarios we use Selenium only to get past browser detection, then hand its cookies over to an HTTP module and carry on from there.
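
As an illustration only (this hand-off is not part of the code below), that cookie transfer could be sketched roughly like this:

import requests
from selenium import webdriver

driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
driver.get("https://www.nasdaq.com/market-activity/stocks/screener")

# copy the browser's cookies into a plain requests session
session = requests.Session()
for cookie in driver.get_cookies():
    session.cookies.set(cookie["name"], cookie["value"])
driver.quit()

# further requests reuse the browser's cookies without the browser
r = session.get("https://www.nasdaq.com/market-activity/stocks/screener")
print(r.status_code)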

As for your task: I noticed there is an API that actually feeds the data behind the HTML source.

Here's a quick call to it:

import pandas as pd
import requests


def main(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
    }

    params = {
        'tableonly': 'true',
        'limit': 1000
    }
    # call the screener API directly instead of scraping the rendered page
    r = requests.get(url, params=params, headers=headers)
    goal = pd.DataFrame(r.json()['data']['table']['rows'])
    print(goal)

    goal.to_csv('data.csv', index=False)


if __name__ == "__main__":
    main('https://api.nasdaq.com/api/screener/stocks')

Note that each page contains 25 tickers. With the limit set to 1000, my code fetches the equivalent of 1000 / 25 = 40 pages.

You don't need to loop over pages here, because you can simply increase the limit instead.

But if you do want to use a for loop, then you have to loop over the following URL, incrementing the offset each time:

https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0
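
For example, a sketch of such a loop, stepping the offset by the 25-ticker page size mentioned above:

import pandas as pd
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}

frames = []
for offset in range(0, 100, 25):  # first four pages of 25 tickers each
    params = {"tableonly": "true", "limit": 25, "offset": offset}
    r = requests.get("https://api.nasdaq.com/api/screener/stocks",
                     params=params, headers=headers)
    frames.append(pd.DataFrame(r.json()["data"]["table"]["rows"]))

df = pd.concat(frames, ignore_index=True)
print(df)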
