Slow scraping with Selenium
I am trying to scrape data from a website with Selenium, but it is extremely slow; every single record takes about a minute.
The page is https://jamabandi.nic.in/land%20records/NakalRecord, and I want to scrape every record on it.
Is there a better way to do this? Is there an API or a plain HTTPS request I could use instead?
My code is:
di=11
district_xpath_new = (By.XPATH, district_xpath)
dropdown_district=Select(handle_stale_element_reference(driver, district_xpath_new))
dropdown_district.select_by_index(di)
total_districts=len(Select(handle_stale_element_reference(driver, district_xpath_new)).options)
while(di<(total_districts)):
    time.sleep(5)
    driver,district_name=district_func(driver,di,district_xpath)
    print("District Started "+str(di))
    te=1
    driver,dropdown_tehsil,total_tehsils,tehsil_name=tehsil_func(driver,te,tehsil_xpath)
    # dropdown_tehsil.select_by_index(te)
    while(te<total_tehsils):
        time.sleep(5)
        print("Tehsil Started is"+str(te))
        driver,dropdown_tehsil,total_tehsils,tehsil_name=tehsil_func(driver,te,tehsil_xpath)
        vi=8
        driver,dropdown_village,total_vill,village_name=village_func(driver,vi,vill_xpath)
        while(vi<total_vill):
            time.sleep(5)
            print("Village Started is"+str(vi))
            driver,dropdown_village,total_vill,village_name=village_func(driver,vi,vill_xpath)
            ye=3
            driver,dropdown_year,total_year,year=year_func(driver,ye,year_xpath)
            while(ye<total_year):
                time.sleep(5)
                print("Year Started is"+str(ye))
                driver,dropdown_year,total_year,year=year_func(driver,ye,year_xpath)
                ow=2
                time.sleep(10)
                print("Selected Personal Ownerlist"+str(ow))
                driver,dropdown_owner=owner_drop(driver,ow,owner_dropdown_xpath)
                name=280
                driver,owner_name_drop,total_names,name_of_owner=owner_names_func(driver,name,owner_name_xpath)
                while(name<total_names):
                    print("Names Started is"+str(name))
                    time.sleep(2)
                    driver,owner_name_drop,total_names,name_of_owner=owner_names_func(driver,name,owner_name_xpath)
                    try:
                        if '?' not in name_of_owner:
                            print(name_of_owner)
                            df_owner,driver=dataframe_check(driver,district_name,tehsil_name,village_name,year,name)
                            driver=select_all(driver,di,ye,te,vi,ow,name)
                        else:
                            pass
                    except:
                        print("Name is"+str(name))
                        print("Not Found")
                    name+=1
1 Answer
Selenium carries a lot of overhead: it launches a browser, renders the page's HTML/CSS, executes JavaScript, and so on. In most cases the work done through Selenium boils down to a handful of HTTP requests that can be sent straight from Python, which removes most of that overhead.
When scraping, your browser's developer tools are your friend. A quick look at the Network tab shows that every time you make a selection, the page sends a POST request to the server to fetch the options for the next category. It also sends a set of state variables along with it. You essentially need to replicate these requests with the requests module.
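For orientation, this is roughly what a single "district selected" postback looks like as a plain request. It is only a sketch: the dropdown name is inferred from the control IDs used in the full example below, and the option value "1" is a placeholder; read the real values from the Network tab or from the page's markup.

import requests
from bs4 import BeautifulSoup

URL = "https://jamabandi.nic.in/land%20records/NakalRecord"

# Fetch the page once to obtain the hidden ASP.NET state fields
# (__VIEWSTATE, __EVENTVALIDATION, ...).
page = BeautifulSoup(requests.get(URL).content, "html.parser")
payload = {
    inp["name"]: inp.get("value", "")
    for inp in page.select('form#aspnetForm input[name^="__"]')
}

# Tell ASP.NET which control changed and which option was picked.
# Both values below are assumptions, for illustration only.
payload["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$ddldname"
payload["ctl00$ContentPlaceHolder1$ddldname"] = "1"

# The response contains the tehsil options for the chosen district.
resp = requests.post(URL, data=payload)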
Below is a working example that you can adapt to your own needs:
import requests
from bs4 import BeautifulSoup as BS

URL = "https://jamabandi.nic.in/land%20records/NakalRecord"


def get_aspnet_form(soup: BS):
    # Base payload for every postback, plus the hidden ASP.NET state inputs
    # (__VIEWSTATE, __EVENTVALIDATION, ...) copied from the current page.
    form = {
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__SCROLLPOSITIONX": "0",
        "__SCROLLPOSITIONY": "0",
        "ctl00$ContentPlaceHolder1$a": "RdobtnOwner",
    }
    forms = soup.find("form", attrs={"id": "aspnetForm"})
    for i in forms.find_all("input", recursive=False):
        form.update({i.attrs["name"]: i.attrs["value"]})
    return form


def get_options(soup: BS, type_: str) -> list:
    # Return the (value, text) pairs of a dropdown's options together with the
    # dropdown's name attribute, which is used as the __EVENTTARGET.
    types = {
        "district": "ddldname",
        "tehsil": "ddltname",
        "village": "ddlvname",
        "period": "ddlPeriod",
        "owner": "ddlOwner",
        "record": "ListBox1",
    }
    ID = "ctl00_ContentPlaceHolder1_%s" % types[type_]
    select = soup.find("select", attrs={"id": ID})
    result = []
    for option in select.find_all("option"):
        value = option.attrs["value"]
        text = option.text
        if value != "-1":  # skip the placeholder entry
            result.append((value, text))
    return [result, select.attrs["name"]]


def get_records(soup: BS):
    # Collect the entries of the result list box, skipping masked names.
    ID = "ctl00_ContentPlaceHolder1_ListBox1"
    records = soup.find("select", attrs={"id": ID})
    result = []
    for record in records.find_all("option"):
        name = record.attrs["value"]
        if "?" not in name:
            result.append(name)
    return result


if __name__ == "__main__":
    # Plain GET to obtain the initial page and its state variables.
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")
    form = get_aspnet_form(soup)
    districts, district_event_target = get_options(soup, "district")
    for district_id, district_name in districts:
        print("Scraping from district %s:" % district_name)
        form["__EVENTTARGET"] = district_event_target
        form[district_event_target] = district_id
        soup = BS(requests.post(URL, data=form).content, "html.parser")
        form = get_aspnet_form(soup)
        tehsils, tehsil_event_target = get_options(soup, "tehsil")
        for tehsil_id, tehsil_name in tehsils:
            print("Scraping from tehsil %s:" % tehsil_name)
            form["__EVENTTARGET"] = tehsil_event_target
            form[district_event_target] = district_id
            form[tehsil_event_target] = tehsil_id
            soup = BS(requests.post(URL, data=form).content, "html.parser")
            form = get_aspnet_form(soup)
            villages, village_event_target = get_options(soup, "village")
            for village_id, village_name in villages:
                print("Scraping from village %s:" % village_name)
                form["__EVENTTARGET"] = village_event_target
                form[district_event_target] = district_id
                form[tehsil_event_target] = tehsil_id
                form[village_event_target] = village_id
                soup = BS(requests.post(URL, data=form).content, "html.parser")
                form = get_aspnet_form(soup)
                periods, period_event_target = get_options(soup, "period")
                for period_id, period_name in periods:
                    print("Scraping from period %s:" % period_name)
                    form["__EVENTTARGET"] = period_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    form[period_event_target] = period_id
                    soup = BS(requests.post(URL, data=form).content, "html.parser")
                    form = get_aspnet_form(soup)
                    owners, owner_event_target = get_options(soup, "owner")
                    # Owner type "1" corresponds to the Niji (private owner) list.
                    form["__EVENTTARGET"] = owner_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    form[period_event_target] = period_id
                    form[owner_event_target] = "1"
                    soup = BS(requests.post(URL, data=form).content, "html.parser")
                    records = get_records(soup)
                    print(records)
I wrote three utility functions to keep the complexity down. The main loop of the code does the following:
- Fetch the plain page with a GET request.
- Use an HTML parser (such as BeautifulSoup) to read all of the district options.
- Collect the form variables that have to be sent with the POST request.
- Loop over the districts, sending a POST request to get the page for each district.
- Repeat the steps above on every iteration, this time for the tehsils.
- Loop over the tehsils to get the villages.
- Loop over the villages to get the periods.
- Finally, for each period, fetch only the Niji owner records (if you want to persist them instead of printing, see the sketch after this list).
- Repeat until done.
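Here is a small, optional sketch of how you might append each batch of records to a CSV file at the point where the example calls print(records). The column layout and the filename records.csv are my own choices, and it assumes the loop variables from the example (district_name, tehsil_name, village_name, period_name, records) are in scope:

import csv

# Append one row per owner record, tagged with the selection that produced it.
with open("records.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for record in records:
        writer.writerow([district_name, tehsil_name, village_name, period_name, record])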
Note 1: to install the dependencies, run pip install requests bs4.
Note 2: for reference, this approach takes roughly 500 ms (0.5 s) per set of records.