How do I extract a specific div's table data with lxml?
I have a page I want to parse with lxml, and the table's data changes into a different form after you click a tab.
from urllib.request import urlopen
import lxml.html
url="http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059"
material=urlopen(url).read()
root = lxml.html.document_fromstring(material)  # parse() expects a file object or URL, not bytes
If I write
set = root.xpath('//table[@id="BBMX_table"]//tr')
I can get the table data, but it is the data behind this tab:
<li class="first current" onclick="ChangeRptF10AssetStatement('30005902','8','All',this,'');">
The table data I actually want is the one behind this tab:
<li class="" onclick="ChangeRptF10AssetStatement('30005902','8','Year',this,'');">
How should I write my xpath expression for root.xpath correctly? More information: when you click 按年度 (per year), the table changes into a different table, via onclick="ChangeRptF10AssetStatement('30005902','8','Year',this,'')".
I have tried using selenium:
import lxml.html
import lxml.etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
browser = webdriver.Chrome(options=chrome_options,executable_path='/usr/bin/chromedriver')
browser.get("http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059")
root = lxml.html.document_fromstring(browser.page_source)
mystring = lxml.etree.tostring(root, encoding = "unicode")
with open("/tmp/test.html","w") as fh:
    fh.write(mystring)
When I open /tmp/test.html, there is no data inside it. How can I get the data I want?
5 Answers
You don't need the lxml library; the code below gets all the table data using selenium alone. If you do want to use lxml, you can take the xpaths used in this code and change the table id to get the values of a different table.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
browser = webdriver.Chrome(options=chrome_options,executable_path='chromedriver.exe')
browser.get("http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059")
allrows = len(browser.find_elements_by_xpath('//*[@id="report_zyzb"]/table/tbody/tr'))
# XPath positions are 1-based, so iterate from 1 to allrows inclusive
for i in range(1, allrows + 1):
    allcolumns = browser.find_elements_by_xpath('//*[@id="report_zyzb"]/table/tbody/tr[' + str(i) + ']//span')
    print('\n')
    for j in range(len(allcolumns)):
        print(allcolumns[j].text, end=" ")
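As a concrete illustration of the lxml route mentioned above, here is a minimal sketch, assuming the table has already been rendered into browser.page_source (the #report_zyzb id comes from the code above; swapping in another table's id is untested):
import lxml.html

# sketch: parse the selenium-rendered page with lxml (assumes `browser` from above)
root = lxml.html.document_fromstring(browser.page_source)

# change the table id here to read a different table's values
for row in root.xpath('//*[@id="report_zyzb"]/table/tbody/tr'):
    cells = [span.text_content().strip() for span in row.xpath('.//span')]
    print(' '.join(cells))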
Getting the data you need is a bit involved, because the table you want is loaded by a JavaScript function that makes an AJAX request. The good news is that the whole process is transparent and easy to follow.
The steps you need to take are:
- Parse the base page and find the right onclick attribute value on the li tag we are interested in.
- Using a regular expression, get the first two argument values of the ChangeRptF10AssetStatement() call; we need them for the AJAX request.
- Make a new request to http://f10.eastmoney.com/f10_v2/BackOffice.aspx, passing several parameters: the current time (seconds since 1970), the ChangeRptF10AssetStatement() argument values we just obtained, and a few other fixed parameters.
Here is the complete working code:
import calendar
import re
import time
from urllib.request import urlopen
import lxml.html
# define base url with placeholders for dynamic parameters
BASE_URL = "http://f10.eastmoney.com/f10_v2/BackOffice.aspx?timetip=%s000&command=RptF10AssetStatement&paramReportTime=Year&paramCode=%s&paramNum=%s"
PATTERN = re.compile(r"\('(\d+)','(\d+)',")
# parse initial page
url = "http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059"
root = lxml.html.parse(urlopen(url))
# extract js function arguments from `onclick` attribute value
li = root.xpath('//div[@class="content"][4]/div/ul/li')[1]
code, num = PATTERN.search(li.attrib['onclick']).groups()
# get the current time
seconds_since_epoch = calendar.timegm(time.gmtime())
# parse second page
url = BASE_URL % (seconds_since_epoch, code, num)
root = lxml.html.parse(urlopen(url))
# get table titles (just for example)
for row in root.xpath('//tr/th/span'):
    print(row.text)
For now this code prints the table titles, just to show you that it works.
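If you want more than the titles, you could walk every row of the returned fragment. A minimal sketch, assuming the BackOffice.aspx response is a plain HTML table with data cells in td elements (an assumption about markup the answer does not show):
# print each row of the returned table (`root` is the parsed second page)
for tr in root.xpath('//tr'):
    cells = [cell.text_content().strip() for cell in tr.xpath('./th|./td')]
    print(' | '.join(cells))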
We can get the raw data from their JSON requests. After tracing the data sent to the server, I found two requests related to the yearly report table:
http://f10.eastmoney.com/NewFinanceAnalysis/zcfzbDateAjax?reportDateType=1&code=SZ300059
and another one:
http://f10.eastmoney.com/NewFinanceAnalysis/zcfzbAjax?companyType=4&reportDateType=1&reportType=1&endDate=&code=SZ300059
(this second URL is reconstructed from the parameters used in request_reports() below).
The first request returns an array of dates, which are used as the pagination parameter for the second request; the second request returns the raw data the table is generated from.
The Chinese titles of the table can be found in the HTML and extracted with a regular expression.
The code below produces the data in CSV format on standard output; you can save it to a file with:
python3 script.py > out.csv
import json
import csv
import sys
import re

import requests


def request_report_dates(code):
    # first request: the list of report dates, later used for pagination
    report_date_type = 1
    url = (
        "http://f10.eastmoney.com/NewFinanceAnalysis/zcfzbDateAjax?"
        "reportDateType={report_date_type}&"
        "code={code}".format(report_date_type=report_date_type, code=code)
    )
    resp = requests.get(url).json()
    header = resp["data"]
    return header


def request_reports(code, end_date=""):
    # second request: the raw report data, paginated via `end_date`
    company_type = 4
    report_date_type = 1
    report_type = 1
    url = (
        "http://f10.eastmoney.com/NewFinanceAnalysis/zcfzbAjax?"
        "companyType={company_type}&"
        "reportDateType={report_date_type}&"
        "reportType={report_type}&"
        "endDate={end_date}&"
        "code={code}".format(
            company_type=company_type,
            report_date_type=report_date_type,
            report_type=report_type,
            end_date=end_date,
            code=code,
        )
    )
    resp = requests.get(url).json()
    # the endpoint returns a JSON-encoded string, so decode it a second time
    body = json.loads(resp)
    return body


def request_zh_header_map():
    # scrape the Chinese column titles out of the page's HTML with a regex
    url = "http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059"
    html = requests.get(url).text
    results = re.findall(
        r'<span(?: style=".+")?>'
        r'(?:&nbsp;)*'
        r'([^{<]+)'
        r'</span>'
        r'(?:[\n\s])*?'
        r'</td>'
        r'[{}<>\sA-z"=-]+?'
        r'<span>'
        r'[\n\s{(\w]+'
        r'([\w.]+)'
        r'[,\d\n\s\w})]+'
        r'</span>'
        r'</td>',
        html,
    )
    header_map = dict()
    for key_zh, key_en in results:
        key_en = key_en.lstrip('.')
        header_map[key_en] = key_zh
    return header_map


def get_next_report_date(report_dates, report):
    # locate the last fetched report's date in the date list and return the
    # next endDate to request, or None when there is nothing left to fetch
    try:
        idx = report_dates.index(report[-1]["REPORTDATE"].split()[0].replace("/", "-"))
        if idx != len(report_dates) - 1:
            return report_dates[idx]
        else:
            return None
    except ValueError:
        return None


def main():
    code = "SZ300059"
    reports = []
    report_dates = request_report_dates(code)
    next_report_date = ""
    header_map = request_zh_header_map()
    while True:
        report = request_reports(code, next_report_date)
        reports += report
        next_report_date = get_next_report_date(report_dates, report)
        if not next_report_date:
            break

    # write everything as CSV to stdout, with Chinese headers where known
    cw = csv.writer(sys.stdout)
    headers = [header_map.get(header, header) for header in reports[0].keys()]
    cw.writerow(headers)
    for r in reports:
        cw.writerow(list(r.values()))


if __name__ == "__main__":
    main()
Here is the HTML you get with your code; the table lives inside a script tag:
<script type="text/template" id="tmpl_zyzb">
    {{if (zyzb==null||zyzb.length<=0)}}
    <div>暂无数据</div>
    {{else}}
    <table>
        <tbody>
            <tr>
                <th class="tips-colname-Left">
                    <span>每股指标</span>
                </th>
                {{each zyzb as value i}}
                <th class="tips-fieldname-Right" data-value="{{value.date}}">
                    <span>{{value.date.substr(2,8)}}</span>
                </th>
                {{/each}}
            </tr>
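In other words, the static HTML contains only this empty template; the data is filled in later by JavaScript. A quick way to confirm that, as a sketch using the tmpl_zyzb id shown above:
import lxml.html
from urllib.request import urlopen

# the raw page only contains the template text, not the table data
root = lxml.html.parse(urlopen("http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059"))
template = root.xpath('//script[@id="tmpl_zyzb"]/text()')
print(template[0][:200] if template else "template not found")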
To use Selenium, you need to wait for the page to finish loading. The code below shows an example using WebDriverWait:
import lxml.html
from selenium.webdriver.support.ui import WebDriverWait

# `browser` is the headless Chrome instance from the question's code
browser.get("http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059")
WebDriverWait(browser, 10).until(lambda d: d.execute_script(
    'return ["complete", "interactive"].indexOf(document.readyState) != -1'))
root = lxml.html.document_fromstring(browser.page_source)
print(root.xpath("//*[@class='name']//strong")[0].text)
print(root.xpath("//div[@id='report_zyzb']//th//span")[0].text)
You can get the information for every table on the page through API requests. Each tab of the first 主要指标 (main indicators) section has its own URL with a different type parameter (0, 1 and 2). The same approach works for the other tables:
import requests
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': '*/*',
    'DNT': '1',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    'Referer': 'http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'ru,en-US;q=0.9,en;q=0.8,tr;q=0.7',
}

# Section: 主要指标 (main indicators)
section_url = 'http://f10.eastmoney.com/NewFinanceAnalysis/MainTargetAjax'
data_code = 'SZ300059'

with requests.Session() as s:
    # 按报告期 (by reporting period)
    data_type = 0
    response = s.get(f'{section_url}?type={data_type}&code={data_code}', headers=headers, verify=False)
    print(response.text)

    # 按年度 (by year)
    data_type = 1
    response = s.get(f'{section_url}?type={data_type}&code={data_code}', headers=headers, verify=False)
    print(response.text)

    # 按单季度 (by single quarter)
    data_type = 2
    response = s.get(f'{section_url}?type={data_type}&code={data_code}', headers=headers, verify=False)
    print(response.text)
JSON response (formatted):
[
{
"date":"2018-12-31",
"jbmgsy":"0.1858",
"kfmgsy":"0.1836",
"xsmgsy":"0.1858",
"mgjzc":"2.8010",
"mggjj":"1.0650",
"mgwfply":"0.6603",
"mgjyxjl":"0.5161",
"yyzsr":"31.2亿",
"mlr":"8.51亿",
"gsjlr":"9.59亿",
"kfjlr":"9.47亿",
"yyzsrtbzz":"22.64",
"gsjlrtbzz":"50.52",
"kfjlrtbzz":"53.87",
"yyzsrgdhbzz":"-1.17",
"gsjlrgdhbzz":"-2.19",
"kfjlrgdhbzz":"-0.92",
"jqjzcsyl":"6.32",
"tbjzcsyl":"6.11",
"tbzzcsyl":"2.35",
"mll":"27.25",
"jll":"30.68",
"sjsl":"14.51",
"yskyysr":"0.10",
"xsxjlyysr":"1.12",
"jyxjlyysr":"2.04",
"zzczzy":"0.08",
"yszkzzts":"80.72",
"chzzts":"139.58",
"zcfzl":"60.58",
"ldzczfz":"79.99",
"ldbl":"1.78",
"sdbl":"1.77"
},
{
"date":"2017-12-31",
"jbmgsy":"0.1488",
"kfmgsy":"0.1438",
"xsmgsy":"0.1488",
"mgjzc":"3.1381",
"mggjj":"1.4559",
"mgwfply":"0.6116",
"mgjyxjl":"-1.4363",
"yyzsr":"25.5亿",
"mlr":"4.75亿",
"gsjlr":"6.37亿",
"kfjlr":"6.16亿",
"yyzsrtbzz":"8.29",
"gsjlrtbzz":"-10.77",
"kfjlrtbzz":"3.43",
"yyzsrgdhbzz":"7.48",
"gsjlrgdhbzz":"6.80",
"kfjlrgdhbzz":"9.79",
"jqjzcsyl":"4.86",
"tbjzcsyl":"4.34",
"tbzzcsyl":"1.84",
"mll":"18.64",
"jll":"24.93",
"sjsl":"6.51",
"yskyysr":"0.12",
"xsxjlyysr":"1.05",
"jyxjlyysr":"-5.54",
"zzczzy":"0.07",
"yszkzzts":"98.08",
"chzzts":"125.67",
"zcfzl":"64.92",
"ldzczfz":"80.05",
"ldbl":"1.67",
"sdbl":"1.66"
},
{
"date":"2016-12-31",
"jbmgsy":"0.2059",
"kfmgsy":"0.1717",
"xsmgsy":"0.2059",
"mgjzc":"3.6042",
"mggjj":"1.9186",
"mgwfply":"0.6112",
"mgjyxjl":"-1.1882",
"yyzsr":"23.5亿",
"mlr":"6.47亿",
"gsjlr":"7.14亿",
"kfjlr":"5.95亿",
"yyzsrtbzz":"-19.62",
"gsjlrtbzz":"-61.39",
"kfjlrtbzz":"-66.86",
"yyzsrgdhbzz":"-1.13",
"gsjlrgdhbzz":"-24.72",
"kfjlrgdhbzz":"-26.92",
"jqjzcsyl":"6.60",
"tbjzcsyl":"5.57",
"tbzzcsyl":"2.81",
"mll":"27.49",
"jll":"30.29",
"sjsl":"10.74",
"yskyysr":"0.11",
"xsxjlyysr":"1.04",
"jyxjlyysr":"-3.51",
"zzczzy":"0.09",
"yszkzzts":"90.54",
"chzzts":"75.18",
"zcfzl":"52.45",
"ldzczfz":"97.77",
"ldbl":"1.56",
"sdbl":"1.55"
},
{
"date":"2015-12-31",
"jbmgsy":"1.0897",
"kfmgsy":"1.0585",
"xsmgsy":"1.0897",
"mgjzc":"4.4066",
"mggjj":"2.3754",
"mgwfply":"0.9065",
"mgjyxjl":"0.2953",
"yyzsr":"29.3亿",
"mlr":"20.5亿",
"gsjlr":"18.5亿",
"kfjlr":"18.0亿",
"yyzsrtbzz":"378.08",
"gsjlrtbzz":"1015.45",
"kfjlrtbzz":"1002.51",
"yyzsrgdhbzz":"13.62",
"gsjlrgdhbzz":"17.11",
"kfjlrgdhbzz":"14.51",
"jqjzcsyl":"66.42",
"tbjzcsyl":"22.63",
"tbzzcsyl":"12.36",
"mll":"70.05",
"jll":"63.18",
"sjsl":"14.85",
"yskyysr":"0.07",
"xsxjlyysr":"0.98",
"jyxjlyysr":"0.19",
"zzczzy":"0.20",
"yszkzzts":"27.67",
"chzzts":"--",
"zcfzl":"65.55",
"ldzczfz":"96.64",
"ldbl":"1.31",
"sdbl":"1.31"
},
{
"date":"2014-12-31",
"jbmgsy":"0.1370",
"kfmgsy":"0.1346",
"xsmgsy":"0.1370",
"mgjzc":"1.5540",
"mggjj":"0.2420",
"mgwfply":"0.2640",
"mgjyxjl":"1.9535",
"yyzsr":"6.12亿",
"mlr":"1.94亿",
"gsjlr":"1.66亿",
"kfjlr":"1.63亿",
"yyzsrtbzz":"146.31",
"gsjlrtbzz":"3213.59",
"kfjlrtbzz":"--",
"yyzsrgdhbzz":"39.62",
"gsjlrgdhbzz":"82.92",
"kfjlrgdhbzz":"90.55",
"jqjzcsyl":"9.38",
"tbjzcsyl":"8.82",
"tbzzcsyl":"3.85",
"mll":"31.68",
"jll":"27.07",
"sjsl":"16.01",
"yskyysr":"0.22",
"xsxjlyysr":"1.08",
"jyxjlyysr":"3.86",
"zzczzy":"0.14",
"yszkzzts":"45.05",
"chzzts":"--",
"zcfzl":"69.60",
"ldzczfz":"99.89",
"ldbl":"1.38",
"sdbl":"1.38"
},
{
"date":"2013-12-31",
"jbmgsy":"0.0100",
"kfmgsy":"-0.0039",
"xsmgsy":"0.0100",
"mgjzc":"2.5136",
"mggjj":"1.1785",
"mgwfply":"0.2745",
"mgjyxjl":"0.7084",
"yyzsr":"2.48亿",
"mlr":"-339万",
"gsjlr":"500万",
"kfjlr":"-262万",
"yyzsrtbzz":"11.57",
"gsjlrtbzz":"-86.69",
"kfjlrtbzz":"-108.51",
"yyzsrgdhbzz":"28.64",
"gsjlrgdhbzz":"--",
"kfjlrgdhbzz":"--",
"jqjzcsyl":"0.29",
"tbjzcsyl":"0.30",
"tbzzcsyl":"0.24",
"mll":"-1.36",
"jll":"2.01",
"sjsl":"-0.42",
"yskyysr":"0.39",
"xsxjlyysr":"0.94",
"jyxjlyysr":"1.92",
"zzczzy":"0.12",
"yszkzzts":"62.86",
"chzzts":"--",
"zcfzl":"30.57",
"ldzczfz":"99.25",
"ldbl":"3.02",
"sdbl":"3.02"
},
{
"date":"2012-12-31",
"jbmgsy":"0.1100",
"kfmgsy":"0.0900",
"xsmgsy":"0.1100",
"mgjzc":"5.1175",
"mggjj":"3.3624",
"mgwfply":"0.6399",
"mgjyxjl":"0.0600",
"yyzsr":"2.23亿",
"mlr":"3533万",
"gsjlr":"3758万",
"kfjlr":"3074万",
"yyzsrtbzz":"-20.55",
"gsjlrtbzz":"-64.72",
"kfjlrtbzz":"-68.18",
"yyzsrgdhbzz":"-12.07",
"gsjlrgdhbzz":"-45.99",
"kfjlrgdhbzz":"-50.55",
"jqjzcsyl":"2.20",
"tbjzcsyl":"2.19",
"tbzzcsyl":"2.07",
"mll":"15.86",
"jll":"16.88",
"sjsl":"13.29",
"yskyysr":"0.27",
"xsxjlyysr":"0.77",
"jyxjlyysr":"0.09",
"zzczzy":"0.12",
"yszkzzts":"56.91",
"chzzts":"--",
"zcfzl":"4.54",
"ldzczfz":"97.80",
"ldbl":"20.02",
"sdbl":"20.02"
},
{
"date":"2011-12-31",
"jbmgsy":"0.5100",
"kfmgsy":"0.4600",
"xsmgsy":"0.5100",
"mgjzc":"8.1000",
"mggjj":"5.9674",
"mgwfply":"0.9669",
"mgjyxjl":"0.7431",
"yyzsr":"2.80亿",
"mlr":"1.10亿",
"gsjlr":"1.07亿",
"kfjlr":"9661万",
"yyzsrtbzz":"51.55",
"gsjlrtbzz":"59.62",
"kfjlrtbzz":"35.11",
"yyzsrgdhbzz":"12.27",
"gsjlrgdhbzz":"11.64",
"kfjlrgdhbzz":"4.62",
"jqjzcsyl":"6.44",
"tbjzcsyl":"6.27",
"tbzzcsyl":"6.08",
"mll":"39.14",
"jll":"38.01",
"sjsl":"12.25",
"yskyysr":"0.39",
"xsxjlyysr":"1.12",
"jyxjlyysr":"0.56",
"zzczzy":"0.16",
"yszkzzts":"38.93",
"chzzts":"--",
"zcfzl":"6.76",
"ldzczfz":"100.00",
"ldbl":"13.13",
"sdbl":"13.13"
},
{
"date":"2010-12-31",
"jbmgsy":"0.5100",
"kfmgsy":"0.5400",
"xsmgsy":"0.5100",
"mgjzc":"11.5200",
"mggjj":"9.4387",
"mgwfply":"0.9209",
"mgjyxjl":"0.4991",
"yyzsr":"1.85亿",
"mlr":"7032万",
"gsjlr":"6674万",
"kfjlr":"7150万",
"yyzsrtbzz":"12.01",
"gsjlrtbzz":"-7.13",
"kfjlrtbzz":"6.78",
"yyzsrgdhbzz":"1.73",
"gsjlrgdhbzz":"-10.81",
"kfjlrgdhbzz":"0.68",
"jqjzcsyl":"5.27",
"tbjzcsyl":"4.14",
"tbzzcsyl":"6.67",
"mll":"38.02",
"jll":"36.10",
"sjsl":"9.82",
"yskyysr":"0.37",
"xsxjlyysr":"1.19",
"jyxjlyysr":"0.38",
"zzczzy":"0.18",
"yszkzzts":"50.99",
"chzzts":"--",
"zcfzl":"4.09",
"ldzczfz":"100.00",
"ldbl":"23.80",
"sdbl":"23.80"
}
]
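To consume this response in code rather than just print it, you can decode it into Python objects. A minimal sketch, assuming MainTargetAjax returns a plain JSON array as shown above (if it comes back as a JSON-encoded string instead, as the zcfzbAjax endpoint in another answer does, add an extra json.loads() pass first):
import csv
import sys

# decode the last response (按年度) into a list of dicts
records = response.json()

# write one CSV row per report date
writer = csv.writer(sys.stdout)
writer.writerow(records[0].keys())
for record in records:
    writer.writerow(record.values())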
When you scrape a website, you can cause unnecessary problems for it.
Make sure the site you are scraping does not forbid it; if they say scraping is not allowed, you should respect that.
I see that you used selenium in your code and wrote the result out to an html file.
Update: to make the code stable, the way we wait for the site's elements to finish loading should be improved, as Sers suggested. I adjusted the code as follows:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
browser = webdriver.Chrome(options=chrome_options,
                           executable_path=r'F:\chromedriver.exe')
wait = WebDriverWait(browser, 20)

list_stock = ['sz300059', 'sz300766', 'sz002950']

try:
    for id_stock in list_stock:
        url_id = "http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=" + id_stock
        browser.get(url_id)

        # click the 按年度 (per year) tab
        wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#zyzbTab > li:nth-child(2)")))
        element_per_year = browser.find_element_by_css_selector('#zyzbTab > li:nth-child(2)')
        element_per_year.click()

        # get the table
        wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#report_zyzb")))
        # time.sleep(5)
        element_tb_per_year = browser.find_element_by_css_selector('#report_zyzb')
        tb_per_year_html = element_tb_per_year.get_attribute('innerHTML')

        path_file_html = fr'F:\test_{id_stock}.html'
        with open(path_file_html, "w", encoding='utf-8') as fh:
            fh.write(tb_per_year_html)
        print(f'export id: {id_stock}')
except TimeoutException:
    print("Timed out waiting for page to load")
finally:
    browser.close()
    browser.quit()
When WebDriverWait does not work reliably, I think you should use time.sleep; you can search online for more information about it.
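For completeness, a sketch of that fallback: a fixed pause after the click instead of an explicit wait (the five-second figure is an arbitrary choice, not from the answer above):
import time

element_per_year.click()
# crude fallback: give the AJAX-loaded table a fixed amount of time to render
time.sleep(5)
element_tb_per_year = browser.find_element_by_css_selector('#report_zyzb')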