import time

import requests
import selenium.webdriver.common.keys
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
def _select_option_by_index(driver, element_id, index):
    """Pick the option at position *index* in the <select> with id *element_id*.

    The element is looked up immediately before use, exactly as the original
    script did, so each selection works against the current DOM.
    """
    Select(driver.find_element(By.ID, element_id)).select_by_index(index)


# Start Chrome through the local chromedriver binary.
# NOTE(review): newer Selenium 4 releases replace `executable_path` with a
# Service object - confirm the installed Selenium version before changing it.
driver = webdriver.Chrome(executable_path="../drivers/chromedriver.exe")
try:
    driver.get("https://www.Here the address of the relevant website ends with aspx.com.aspx")

    # Choose the query parameters in the three list boxes of the form.
    _select_option_by_index(driver, "ctl00_ContentPlaceHolder1_LB_SEKTOR", 0)
    _select_option_by_index(driver, "ctl00_ContentPlaceHolder1_Lb_Oran", 41)
    _select_option_by_index(driver, "ctl00_ContentPlaceHolder1_LB_DONEM", 1)

    # Submit the form, then give the page a moment to respond before
    # reading its source.
    driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ImageButton1").click()
    time.sleep(1)

    print(driver.page_source)
finally:
    # Always shut the browser down, even if a lookup or click above fails;
    # otherwise the Chrome/chromedriver processes are left running.
    driver.quit()
通过这些代码的最后一部分,我可以打印出页面的源代码。但在页面的源代码中,我只需要下面用JavaScript编写的表格部分。如何提取这一部分,以便将该表格输出为csv?(即如何获取JavaScript部分中的表。)
注:在Selenium测试中,我曾尝试在Chrome中按CTRL-U键,但没有成功。该网页是一个用户交互页面,需要一些交互才能获取我想要的数据。这就是我使用Selenium的原因。
<span id="ctl00_ContentPlaceHolder1_Label2" class="Georgia_10pt_Red"></span>
<div id="ctl00_ContentPlaceHolder1_Divtable">
<div id="table">
<layer name="table" top="0"><IMG height="2" src="../images/spacer.gif" width="2"><br>
<font face="arial" color="#000000" size="2"><b>Tablo Yükleniyor. Lütfen Bekleyiniz...</b></font><br>
</layer>
</div>
</div>
<script language=JavaScript> var theHlp='/yardim/matris.asp';var theTitle = 'Piya Deg';var theCaption='OtomoT (TL)';var lastmod = '';var h='<a class=hislink href=../Hisse/Hisealiz.aspx?HNO=';var e='<a class=hislink href=../endeks/endeksAnaliz.aspx?HNO=';var d='<center><font face=symbol size=1 color=#FF0000><b>ß</b></font></center>';var u='<center><font face=symbol size=1 color=#008000><b>İ</b></font></center>';var n='<center><font face=symbol size=1 color=#00A000><b>=</b></font></center>';var fr='<font color=#FF0000>';var fg='<font color=#008000>';var theFooter=new Array();var theCols = new Array();theCols[0] = new Array('cksart',4,50);theCols[1] = new Array('2018.12',1,60);theCols[2] = new Array('2019.03',1,60);theCols[3] = new Array('2019.06',1,60);theCols[4] = new Array('2019.09',1,60);theCols[5] = new Array('2019.12',1,60);theCols[6] = new Array('2020.03',1,60);var theRows = new Array();theRows[0] = new Array ('<b>'+h+'42>AHRT</B></a>','519,120,000.00','590,520,000.00','597,240,000.00','789,600,000.00','1,022,280,000.00','710,640,000.00');
theRows[1] = new Array ('<b>'+h+'427>SEEL</B></a>','954,800,000.00','983,400,000.00','1,201,200,000.00','1,716,000,000.00','2,094,400,000.00','-');
theRows[2] = new Array ('<b>'+h+'140>TOFO</B></a>','17,545,500,000.00','17,117,389,800.00','21,931,875,000.00','20,844,054,000.00','24,861,973,500.00','17,292,844,800.00');
theRows[3] = new Array ('<b>'+h+'183>MSO</B></a>','768,000,000.00','900,000,000.00','732,000,000.00','696,000,000.00','1,422,000,000.00','1,134,000,000.00');
theRows[4] = new Array ('<b>'+h+'237>KURT</B></a>','2,118,000,000.00','2,517,600,000.00','2,736,000,000.00','3,240,000,000.00','3,816,000,000.00','2,488,800,000.00');
theRows[5] = new Array ('<b>'+h+'668>GRTY</B></a>','517,500,000.00','500,250,000.00','445,050,000.00','552,000,000.00','737,150,000.00','-');
theRows[6] = new Array ('<b>'+h+'291>MEME</B></a>','8,450,000,000.00','8,555,000,000.00','9,650,000,000.00','10,140,000,000.00','13,430,000,000.00','8,225,000,000.00');
theRows[7] = new Array ('<b>'+h+'292>AMMI</B></a>','-','-','-','-','-','-');
theRows[8] = new Array ('<b>'+h+'426>GOTE</B></a>','1,862,578,100.00','1,638,428,300.00','1,689,662,540.00','2,307,675,560.00','2,956,642,600.00','2,121,951,440.00');
var thetable=new mytable();thetable.tableWidth=650;thetable.shownum=false;thetable.controlaccess=true;thetable.visCols=new Array(true,true,true,true,true);thetable.initsort=new Array(0,-1);thetable.inittable();thetable.refreshTable();</script></form>
<div style="clear: both; margin-top: 10px;">
<div style="background-color: Red; border: 2px solid Green; display: none">
TABLO-ALT</div>
<div id="Bannerctl00_SiteBannerControl2">
<div id="_bannerctl00_SiteBannerControl2">
<div id="Sayfabannerctl00_SiteBannerControl2" class="banner_Codex">
</div>
请注意,我只在Java中使用过Selenium,因此我将给您提供最通用、与具体语言无关的答案。请记住,Python版的Selenium可能会提供一种直接执行此操作的方法。
步骤:
如果性能是一个要求,那么这种方法可能有点过于昂贵,因为内容会被解析两次:Selenium首先解析,而HTML解析器稍后将使用从Selenium提取的字符串再次解析
备选方案:如果您的“目标页面”使用AJAX,您可以直接与JavaScript所访问的REST API交互,以获取用于填充页面的数据。我在进行大规模的网页抓取时倾向于采用这种方法,但有时这不是一种选择,所以我使用上述方法。
编辑
更多细节基于评论中的问题:
您可以将BeautifulSoup用作HTML解析库
若要在BeautifulSoup中加载页面,请使用(例如 soup = BeautifulSoup(driver.page_source, "html.parser")):
然后看看这个答案,看看如何从汤中提取特定的成分:
这将为您提供第一个具有事件水平id的div:
此代码基于:
How to use CSS selectors to retrieve specific links lying in some class using BeautifulSoup?
相关问题 更多 >
编程相关推荐