摘要:我想迭代一个requests
有效负载,这样我就可以更改每次刮取的登录ID号。在
我用requests&beautiful soup做了一个网页浏览。
要登录到该页面,我需要输入一个唯一的ID号;我有一个这样的号码列表,名为hit_list
。在
对于任何给定的ID号,这个脚本都可以正常工作。但我想做的是让它自动化,这样它就可以贯穿我的整个hit_list
换句话说,我希望num
中的num
为每次迭代更改。目前num
保持不变,并且刮片只是根据hit_list
的长度迭代(即,在这种情况下,相同的刮擦将运行五次)
请注意,我对编码非常陌生,这是我的第一个项目。我知道它可能会有问题,很高兴收到建设性的批评。在
Importing Libraries
import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
import time
from datetime import datetime
import openpyxl
#Recording time @ Start
startTime = datetime.now()
print(datetime.now())
#use pymysql to create database- omitted here for parsimony
#This is a sample list, in reality the list will have 100,000 + numbers.
hit_list = [100100403,100100965,100101047,100100874,100100783]
"""
This is my code for importing the real list, included here incase the way the list is imported is relevant to the problem
wb = openpyxl.load_workbook('/Users/Seansmac/Desktop/stage2_trial.xlsx')
sheet= wb.get_sheet_by_name('Sheet1')
type(wb)
#LOUIS: Only importing first twenty (for trial purposes)
for id in range(1,20):
hit_list.append(sheet.cell(row=id, column =1).value)
"""
def web_scrape():
#I'm only creating a function, because I'm told it's always good practice to put any 'bit' of logic into a function- I'm aware this probably looks amateurish.
#Open page
url = 'https://ndber.seai.ie/pass/ber/search.aspx'
with requests.session() as r:
r.headers.update({
'user-agent': 'For more information on this data collection please contact **************************************'
})
for num in hit_list:
#***LOCATION OF THE PROBLEM***
payload_1 = {
'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber':num,
'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
'__VIEWSTATE' :'/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFgxmD2QWAgIBD2QWAgIBD2QWAmYPZBYCZg9kFgQCAQ8WAh4JaW5uZXJodG1sZWQCAw9kFgICAg9kFgJmD2QWBAIBD2QWAgIDDw8WCB4EXyFTQgKAAh4MRGVmYXVsdFdpZHRoHB4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBgU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfBxxkZAIEDxQrAAJkEBYAFgAWABYCZg9kFgICAg9kFgJmDzwrABECARAWABYAFgAMFCsAAGQCBg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCDA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQYAQUzY3RsMDAkRGVmYXVsdENvbnRlbnQkQkVSU2VhcmNoJGdyaWRSYXRpbmdzJGdyaWR2aWV3D2dkrGhAYkdLuZZh8E98usAnWAaRMxurQ1Gquc+9krb7Boc=',
}
r.post(url, data=payload_1)
#click intermediate page
payload_2 = {
'__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
'__VIEWSTATE': "/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFg5mD2QWAgIBDxYCHwJoFgICAQ8PFgIfAmhkFgJmD2QWAmYPZBYEAgEPFgIeCWlubmVyaHRtbGVkAgMPZBYCAgIPZBYCZg9kFgQCAQ9kFgICAw8PFgoeBF8hU0ICgAIeDERlZmF1bHRXaWR0aBweBFRleHQFCTEwMDEwMDMxMh4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBwU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfCBxkZAICDw8WAh8CZ2QWAmYPZBYCZg9kFgICAw9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgJmD2QWAgIBDxYCHwMFDlNlYXJjaCBSZXN1bHRzZAIEDxQrAAIPFgYfAmceElNlbGVjdGVkUm93SW5kZXhlczLNAQABAAAA/////wEAAAAAAAAABAEAAAB+U3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24HAAAICAgJAgAAAAAAAAABAAAADwIAAAAAAAAACAseCmVkaXRfc3R5bGULKXNWMS5ORVQuV2ViQ29udHJvbHMuRWRpdFN0eWxlLCBWMS5ORVQuV2ViQ29udHJvbHMsIFZlcnNpb249MS40LjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj01YmYzNDU3ZDMwODk1MjEzAmQQFgAWABYAFgJmD2QWAgICD2QWAmYPPCsAEQMADxYEHgtfIURhdGFCb3VuZGceC18hSXRlbUNvdW50AgFkARAWABYAFgAMFCsAABYCZg9kFgICAQ9kFgpmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BQkxMDAxMDAzMTIfBgUJMTAwMTAwMzEyZGQCAQ9kFgJmDw8WAh8GBQNCRVJkZAICD2QWAmYPDxYCHwYFCzEwMDE1MTAwMDkwZGQCAw9kFgJmDw8WAh8GBQowNy0wMS0yMDA5ZGQCBA9kFgJmDw8WAh8GBSQzMCBNQVJJTkUgVklFVw1BVEhMT05FDUNPLiBXRVNUTUVBVEhkZAIGDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIIDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIKDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIMDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZBgBBTNjdGwwMCREZWZhdWx0Q29udGVudCRCRVJTZWFyY2gkZ3JpZFJhdGluZ3MkZ3JpZHZpZXcPPCsADAEIAgFkjLH/5QxuANxuCh3kAmhUU/4/OZj+wy8nJDYIFx4Lowo=",
'__VIEWSTATEGENERATOR':"1F9CCB97",
'__EVENTVALIDATION': "/wEdAAbaTEcivWuxiWecwu4mVYO9eUnQmzIzqu4hlt+kSDcrOBWCa0ezllZh+jGXjO1EB1dmMORt6G1O0Qbn0WLg3p+rPmLeN6mjN7eq7JtUZMjpL2DXqeB/GqPe7AFtNDKiJkEPdN6Y/vq7o/49hX+o366Ioav3zEBl37yPlq3sYQBXpQ==",
}
s=r.post(url, data=payload_2)
#scrape the page
soup = bs4.BeautifulSoup(s.content, 'html.parser')
"""
FOR THE PURPOSES OF MY ISSUE EVERYTHING BELOW WORKS FINE & CAN BE SKIPPED
"""
print('\nBEGINNING SCRAPE....')
# First Section
ber_dec = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsBER'})
#Address- clean scrape
address = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'})
address = (address.get_text(',').strip())
print('address:', address)
#Date of Issue- clean scrape
date_issue1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue'})
date_issue = date_issue1.find('div', {'class':'formControlReadonly'})
date_issue = (date_issue.get_text().strip())
print('date_of_issue:',date_issue)
#MPRN -Clean scrape
MPRN1 = ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN'})
MPRN = MPRN1.find('div',{'class':'formControlReadonly'})
MPRN = MPRN.get_text().strip()
print('MPRN:', MPRN)
#Emissions Indicator- clean scrape
emissions_indicator1 = ber_dec.find('div',{'id':'ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue'})
emissions_indicator_bunched = emissions_indicator1.get_text().strip()
print('\n\nem_bunched:',emissions_indicator_bunched)
emissions_indicator, emissions_indicator_unit = emissions_indicator_bunched.split()
print('emissions_indicator:',emissions_indicator)
emissions_indicator_unit= emissions_indicator_unit.replace("(","")
emissions_indicator_unit=emissions_indicator_unit.replace(")","")
print('emissions_indicator_unit:',emissions_indicator_unit)
#BER Score- clean scrape
BER_bunched = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating'})
BER_bunched =(BER_bunched.get_text().strip())
print ('\n \nBER_bunched:', BER_bunched)
BER_score, BER_actual_rating, BER_unit = BER_bunched.split()
print('\nBER_score:',BER_score)
print('\nBER_actual_rating:',BER_actual_rating)
BER_unit = BER_unit.replace("(", " ")
BER_unit = BER_unit.replace(")","")
print('\nClean_BER_unit:',BER_unit )
#Type of Rating- clean scrape
type_of_rating1= ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_TypeOfRating'})
type_of_rating= type_of_rating1.find('div',{'class':'formControlReadonly'})
type_of_rating = type_of_rating.get_text().strip()
print('type_of_rating:',type_of_rating )
# Second Section
dwelling_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsStructure'})
#Dwelling Type- clean scrape
dwelling_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DwellingType'})
dwelling_type = dwelling_type1.find('div',{'class':'formControlReadonly'})
dwelling_type = dwelling_type.get_text().strip()
print ('Dwelling Type:', dwelling_type)
#Number of Stories- clean scrape
num_stories1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_NoStoresy'})
num_stories = num_stories1.find('div',{'class':'formControlReadonly'})
num_stories = num_stories.get_text().strip()
print('Number of Stories:', num_stories)
#Year of Construction- clean scrape
yr_construction1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DateOfConstruction'})
yr_construction = yr_construction1.find('div',{'class':'formControlReadonly'})
yr_construction = yr_construction.get_text().strip()
print('Year of Construction:', yr_construction)
#Floor Area- clean scrape
floor_area= dwelling_details.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_div_FloorArea'})
floor_area = floor_area.get_text().strip()
floor_area, floor_area_unit =floor_area.split()
floor_area_unit = floor_area_unit.replace("(","")
floor_area_unit=floor_area_unit.replace(")","")
print('\nFloor Area:', floor_area)
print('floor_area_unit:', floor_area_unit)
#Wall Type- clean scrape
wall_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_WallType'})
wall_type = wall_type1.find('div',{'class':'formControlReadonly'})
wall_type= wall_type.get_text().strip()
print('Wall Type:', wall_type)
#Glazing Type- clean scrape
glazing_type1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_GlazingType'})
glazing_type =glazing_type1.find('div',{'class':'formControlReadonly'})
glazing_type = glazing_type.get_text().strip()
print('Glazing Type:', glazing_type)
#Percent Low Energy Lighting- clean scrape
percent_low_energy_lighting1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_PercentLowEnergyLight'})
percent_low_energy_lighting = percent_low_energy_lighting1.find('div',{'class':'formControlReadonly'})
percent_low_energy_lighting = percent_low_energy_lighting.get_text().strip()
print('% Low Energy Lighting:', percent_low_energy_lighting)
#Space Heating Fuel- clean scrape
space_heating_fuel1 =dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingFuel'})
space_heating_fuel =space_heating_fuel1.find('div',{'class':'formControlReadonly'})
space_heating_fuel = space_heating_fuel.get_text().strip()
print('Space Heating Fuel:',space_heating_fuel)
#Space Heating Efficiency- clean scrape
space_heating_efficiency1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingEfficiency'})
space_heating_efficiency = space_heating_efficiency1.find('div',{'class':'formControlReadonly'})
space_heating_efficiency= space_heating_efficiency.get_text().strip()
print('Space Heating Efficiency:', space_heating_efficiency)
#Water Heatng Fuel- clean scrape
water_heating_fuel1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingFuel'})
water_heating_fuel =water_heating_fuel1.find('div',{'class':'formControlReadonly'})
water_heating_fuel = water_heating_fuel.get_text().strip()
print('Water Heating Fuel:', water_heating_fuel)
#Water Heating Efficiency- clean scrape
water_heating_efficiency1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingEfficiency'})
water_heating_efficiency =water_heating_efficiency1.find('div',{'class':'formControlReadonly'})
water_heating_efficiency= water_heating_efficiency.get_text().strip()
print('Water Heating Efficiency:', water_heating_efficiency)
#thrid section
assessor_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsAssessor'})
#Assessor Number- clean scrape
assessor_num1 = assessor_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfAssessor_container_AssessorNumber'})
assessor_num = assessor_num1.find('div',{'class':'formControlReadonly'})
assessor_num= assessor_num.get_text().strip()
print('Assessor Number:', assessor_num)
print('BER:', num)
print('\***************nSCRAPE FINISHED***************\n')
#Populate datebase
print('\nRECONNECTING WITH DATABASE')
with connection.cursor() as cursor:
print('SUCCESSFUL CONNECTION')
sql =("INSERT INTO table1(BER_number, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
cursor.execute(sql, (num, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating))
print('ROW POPULATED')
#Calling the function
web_scrape()
#Metadata
print('Gathering Details...')
Run_time = datetime.now() - startTime
print('Run Time:', Run_time)
#Loop Finished
print('\n***************PROGRAMME FINISHED***************')
@padraickunningham提供了这个答案的大部分逻辑,但是正如我在他的答案下面的评论所描述的,他的解决方案只让我走了一半。
我已经能够在他的工作基础上解决这个问题。
还有一个步骤需要完成,那就是“点击”一个中介页面,这将导致我想要获取的数据所在的位置。在
提前为我的不标准标签和格式道歉。我是初学者。在
您需要获得新的
__EVENTVALIDATION
令牌等。。。对于每个帖子,您不能只从浏览器中复制值并将其硬编码到帖子数据中:如果我们运行代码并分析一些返回的内容,您可以看到我们正确地获取了每个页面:
^{pr2}$这给了你所有的信息,从表格栏的BER证书编号,你已经有了,所以你不需要担心它。在
当您发现您只需要将数据从第一个post返回的内容传递到第二个有效负载时,如果您将逻辑封装在函数中,那么您的代码也会更易于管理:
我没有解析所有数据,但其余数据的逻辑是相同的,打印解析后返回的dict将为您提供:
相关问题 更多 >
编程相关推荐