我试图点击展开按钮，然后抓取表格

import csv
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Script: open every URL listed in csrdata.txt in Chrome, click the
# "CSR Details of FY 2017-18" link, read the text of the first element whose
# id contains "colfy4", and save one row per page to csr.csv.

# Text file holding the page URLs, one per line.
url_file = "csrdata.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# Drop blank lines (e.g. the one produced by a trailing newline); an empty
# string would otherwise be handed to driver.get().
pages = [line.strip() for line in url_pages.splitlines() if line.strip()]

data = []
driver = webdriver.Chrome()
try:
    # Visit the URLs one by one.
    for single_page in pages:
        driver.get(single_page)
        # find_element(By.…) replaces the find_element_by_* helpers, which
        # were removed in Selenium 4.
        driver.find_element(By.LINK_TEXT, "CSR Details of FY 2017-18").click()
        table = driver.find_elements(By.XPATH, "//*[contains(@id,'colfy4')]")
        # Raises IndexError when the page has no matching element.
        about = table[0].text
        x = about.split("\n")
        print(x)
        data.append(x)
finally:
    # Always close the browser, even if a page fails part-way through.
    driver.quit()

# NOTE(review): the original was collapsed onto one line, so it is unclear
# whether the DataFrame was rebuilt on every iteration; building it once after
# the loop yields the same final CSV.
df = pd.DataFrame(data)
print(df)

# Write to csv.
df.to_csv("csr.csv")

1条回答

网友

1楼 · 发布于 2024-05-23 19:14:12

您不需要使用 selenium，因为所有信息都已包含在 HTML 代码中。您还可以使用 pandas 的内置函数 pd.read_html() 将 HTML 表格直接转换为 DataFrame

from io import StringIO

# Fetch every page in `pages`, parse its HTML tables with pandas, and write
# the combined result to test.csv.
# `pages`, `requests`, `BeautifulSoup` and `pd` are not defined in this
# snippet and must already be in scope.
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')

    table = soup.find_all('table')               # finds all tables
    # Parse the markup once and reuse the result. StringIO avoids the
    # deprecated "literal HTML string" input form of read_html.
    tables = pd.read_html(StringIO(str(table)))
    table_top = tables[0]                        # the top table

    try:                                         # the other table, if it exists
        table_extra = tables[7]
    except IndexError:
        # Fewer than eight tables on this page: keep only the top table.
        table_extra = pd.DataFrame()

    result = pd.concat([table_top, table_extra])
    data.append(result)

pd.concat(data).to_csv('test.csv')

输出：

                            0                          1
0                       Class                     Public
1                       State                 Chandigarh
2                Company Type           Other than Govt.
3                         RoC             RoC-Chandigarh
4                Sub Category  Company limited by shares
5              Listing Status                     Listed
0          Average Net Profit                          0
1  CSR Prescribed Expenditure                          0
2                   CSR Spent                          0
3            Local Area Spent                          0

相关问题更多 >

编程相关推荐

热门问题

热门文章