How do I scrape data from a specific page?



I don't know how many pages the site has, so I need to scrape up to a certain page. I have written code, but it keeps scraping until the site's last page; I want it to stop once it reaches the page I need.

Site: https://worldwide.espacenet.com/classification#
The page I need: https://worldwide.espacenet.com/classification#!/CPC=A99Z99/00

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import pandas as pd
from pymongo import MongoClient

# MongoDB collection that will store the scraped classification entries
client = MongoClient()
db = client.espace1
espace1 = db.espacedata1

driver = webdriver.Chrome(service=Service(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver_win32 (2)\chromedriver.exe'))
url = 'https://worldwide.espacenet.com/classification'
driver.get(url)
time.sleep(10)  # give the classification tree time to render

# Read the first page of the classification tree
div_class_tree = driver.find_element(By.CLASS_NAME, 'class-tree')
div_ul = div_class_tree.find_element(By.TAG_NAME, 'ul')
div_lis = div_ul.find_elements(By.TAG_NAME, 'li')
for div_li in div_lis:
    pass  # print(div_li.text)

# Move to the next page
span_nav_next = driver.find_element(By.CLASS_NAME, 'cpcbrowser-nav-next')
span_nav_next.click()
time.sleep(10)
while True:
    # Collect the text of every entry on the current page
    div_class_tree = driver.find_element(By.CLASS_NAME, 'class-tree')
    div_ul = div_class_tree.find_element(By.TAG_NAME, 'ul')
    div_li = div_ul.find_element(By.TAG_NAME, 'li')
    div_li_ul = div_li.find_element(By.TAG_NAME, 'ul')
    div_li_ul_lis = div_li_ul.find_elements(By.TAG_NAME, 'li')

    data = [li.text for li in div_li_ul_lis]
    records = pd.DataFrame(data).to_dict()
    print(records)

    # Store this page's entries in MongoDB
    espace1.insert_many([{'records': data}])

    # Advance to the next page -- this loop never stops before the last page
    span_nav_next = driver.find_element(By.CLASS_NAME, 'cpcbrowser-nav-next')
    span_nav_next.click()
    time.sleep(10)
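What the loop above is missing is a stop condition. Here is a minimal sketch of one way to add it, assuming the target CPC symbol (e.g. A99Z99) appears somewhere in the tree's visible text once the right page is reached; the marker string is an assumption and may need adjusting to the site's real markup:

# Sketch: stop paginating once the target CPC symbol becomes visible.
# Reuses driver, By, and time from the script above.
TARGET = 'A99Z99'  # assumed stop marker; adjust to the actual on-page text

while True:
    tree = driver.find_element(By.CLASS_NAME, 'class-tree')

    # ... scrape and store the current page here, as in the loop above ...

    if TARGET in tree.text:
        break  # reached the page we need; stop before the site's last page

    driver.find_element(By.CLASS_NAME, 'cpcbrowser-nav-next').click()
    time.sleep(10)

Alternatively, since the site routes through the URL fragment, it may be possible to land on the needed page directly with driver.get('https://worldwide.espacenet.com/classification#!/CPC=A99Z99/00') and skip the pagination entirely.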
