为什么我不能在 `requests` 中循环遍历 `payload` 来迭代我的网络爬虫？

Importing Libraries import requests import pymysql.cursors from pymysql import connect, err, sys, cursors import sys import time import bs4 import time from datetime import datetime import openpyxl #Recording time @ Start startTime = datetime.now() print(datetime.now()) #use pymysql to create database- omitted here for parsimony #This is a sample list, in reality the list will have 100,000 + numbers. hit_list = [100100403,100100965,100101047,100100874,100100783] """ This is my code for importing the real list, included here incase the way the list is imported is relevant to the problem wb = openpyxl.load_workbook('/Users/Seansmac/Desktop/stage2_trial.xlsx') sheet= wb.get_sheet_by_name('Sheet1') type(wb) #LOUIS: Only importing first twenty (for trial purposes) for id in range(1,20): hit_list.append(sheet.cell(row=id, column =1).value) """ def web_scrape(): #I'm only creating a function, because I'm told it's always good practice to put any 'bit' of logic into a function- I'm aware this probably looks amateurish. 
#Open page url = 'https://ndber.seai.ie/pass/ber/search.aspx' with requests.session() as r: r.headers.update({ 'user-agent': 'For more information on this data collection please contact **************************************' }) for num in hit_list: #***LOCATION OF THE PROBLEM*** payload_1 = { 'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber':num, 'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search', '__VIEWSTATE' :'/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFgxmD2QWAgIBD2QWAgIBD2QWAmYPZBYCZg9kFgQCAQ8WAh4JaW5uZXJodG1sZWQCAw9kFgICAg9kFgJmD2QWBAIBD2QWAgIDDw8WCB4EXyFTQgKAAh4MRGVmYXVsdFdpZHRoHB4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBgU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfBxxkZAIEDxQrAAJkEBYAFgAWABYCZg9kFgICAg9kFgJmDzwrABECARAWABYAFgAMFCsAAGQCBg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCDA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQYAQUzY3RsMDAkRGVmYXVsdENvbnRlbnQkQkVSU2VhcmNoJGdyaWRSYXRpbmdzJGdyaWR2aWV3D2dkrGhAYkdLuZZh8E98usAnWAaRMxurQ1Gquc+9krb7Boc=', } r.post(url, data=payload_1) #click intermediate page payload_2 = { '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails', '__VIEWSTATE': 
"/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFg5mD2QWAgIBDxYCHwJoFgICAQ8PFgIfAmhkFgJmD2QWAmYPZBYEAgEPFgIeCWlubmVyaHRtbGVkAgMPZBYCAgIPZBYCZg9kFgQCAQ9kFgICAw8PFgoeBF8hU0ICgAIeDERlZmF1bHRXaWR0aBweBFRleHQFCTEwMDEwMDMxMh4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBwU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfCBxkZAICDw8WAh8CZ2QWAmYPZBYCZg9kFgICAw9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgJmD2QWAgIBDxYCHwMFDlNlYXJjaCBSZXN1bHRzZAIEDxQrAAIPFgYfAmceElNlbGVjdGVkUm93SW5kZXhlczLNAQABAAAA/////wEAAAAAAAAABAEAAAB+U3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24HAAAICAgJAgAAAAAAAAABAAAADwIAAAAAAAAACAseCmVkaXRfc3R5bGULKXNWMS5ORVQuV2ViQ29udHJvbHMuRWRpdFN0eWxlLCBWMS5ORVQuV2ViQ29udHJvbHMsIFZlcnNpb249MS40LjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj01YmYzNDU3ZDMwODk1MjEzAmQQFgAWABYAFgJmD2QWAgICD2QWAmYPPCsAEQMADxYEHgtfIURhdGFCb3VuZGceC18hSXRlbUNvdW50AgFkARAWABYAFgAMFCsAABYCZg9kFgICAQ9kFgpmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BQkxMDAxMDAzMTIfBgUJMTAwMTAwMzEyZGQCAQ9kFgJmDw8WAh8GBQNCRVJkZAICD2QWAmYPDxYCHwYFCzEwMDE1MTAwMDkwZGQCAw9kFgJmDw8WAh8GBQowNy0wMS0yMDA5ZGQCBA9kFgJmDw8WAh8GBSQzMCBNQVJJTkUgVklFVw1BVEhMT05FDUNPLiBXRVNUTUVBVEhkZAIGDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIIDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIKDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIMDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZBgBBTNjdGwwMCREZWZhdWx0Q29udGVudCRCRVJTZWFyY2gkZ3JpZFJhdGluZ3MkZ3JpZHZpZXcPPCsADAEIAgFkjLH/5QxuANxuCh3kAmhUU/4/OZj+wy8nJDYIFx4Lowo=", '__VIEWSTATEGENERATOR':"1F9CCB97", 
'__EVENTVALIDATION': "/wEdAAbaTEcivWuxiWecwu4mVYO9eUnQmzIzqu4hlt+kSDcrOBWCa0ezllZh+jGXjO1EB1dmMORt6G1O0Qbn0WLg3p+rPmLeN6mjN7eq7JtUZMjpL2DXqeB/GqPe7AFtNDKiJkEPdN6Y/vq7o/49hX+o366Ioav3zEBl37yPlq3sYQBXpQ==", } s=r.post(url, data=payload_2) #scrape the page soup = bs4.BeautifulSoup(s.content, 'html.parser') """ FOR THE PURPOSES OF MY ISSUE EVERYTHING BELOW WORKS FINE & CAN BE SKIPPED """ print('\nBEGINNING SCRAPE....') # First Section ber_dec = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsBER'}) #Address- clean scrape address = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'}) address = (address.get_text(',').strip()) print('address:', address) #Date of Issue- clean scrape date_issue1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue'}) date_issue = date_issue1.find('div', {'class':'formControlReadonly'}) date_issue = (date_issue.get_text().strip()) print('date_of_issue:',date_issue) #MPRN -Clean scrape MPRN1 = ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN'}) MPRN = MPRN1.find('div',{'class':'formControlReadonly'}) MPRN = MPRN.get_text().strip() print('MPRN:', MPRN) #Emissions Indicator- clean scrape emissions_indicator1 = ber_dec.find('div',{'id':'ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue'}) emissions_indicator_bunched = emissions_indicator1.get_text().strip() print('\n\nem_bunched:',emissions_indicator_bunched) emissions_indicator, emissions_indicator_unit = emissions_indicator_bunched.split() print('emissions_indicator:',emissions_indicator) emissions_indicator_unit= emissions_indicator_unit.replace("(","") emissions_indicator_unit=emissions_indicator_unit.replace(")","") print('emissions_indicator_unit:',emissions_indicator_unit) #BER Score- clean scrape BER_bunched = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating'}) BER_bunched =(BER_bunched.get_text().strip()) print ('\n \nBER_bunched:', 
BER_bunched) BER_score, BER_actual_rating, BER_unit = BER_bunched.split() print('\nBER_score:',BER_score) print('\nBER_actual_rating:',BER_actual_rating) BER_unit = BER_unit.replace("(", " ") BER_unit = BER_unit.replace(")","") print('\nClean_BER_unit:',BER_unit ) #Type of Rating- clean scrape type_of_rating1= ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_TypeOfRating'}) type_of_rating= type_of_rating1.find('div',{'class':'formControlReadonly'}) type_of_rating = type_of_rating.get_text().strip() print('type_of_rating:',type_of_rating ) # Second Section dwelling_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsStructure'}) #Dwelling Type- clean scrape dwelling_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DwellingType'}) dwelling_type = dwelling_type1.find('div',{'class':'formControlReadonly'}) dwelling_type = dwelling_type.get_text().strip() print ('Dwelling Type:', dwelling_type) #Number of Stories- clean scrape num_stories1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_NoStoresy'}) num_stories = num_stories1.find('div',{'class':'formControlReadonly'}) num_stories = num_stories.get_text().strip() print('Number of Stories:', num_stories) #Year of Construction- clean scrape yr_construction1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DateOfConstruction'}) yr_construction = yr_construction1.find('div',{'class':'formControlReadonly'}) yr_construction = yr_construction.get_text().strip() print('Year of Construction:', yr_construction) #Floor Area- clean scrape floor_area= dwelling_details.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_div_FloorArea'}) floor_area = floor_area.get_text().strip() floor_area, floor_area_unit =floor_area.split() floor_area_unit = floor_area_unit.replace("(","") 
floor_area_unit=floor_area_unit.replace(")","") print('\nFloor Area:', floor_area) print('floor_area_unit:', floor_area_unit) #Wall Type- clean scrape wall_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_WallType'}) wall_type = wall_type1.find('div',{'class':'formControlReadonly'}) wall_type= wall_type.get_text().strip() print('Wall Type:', wall_type) #Glazing Type- clean scrape glazing_type1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_GlazingType'}) glazing_type =glazing_type1.find('div',{'class':'formControlReadonly'}) glazing_type = glazing_type.get_text().strip() print('Glazing Type:', glazing_type) #Percent Low Energy Lighting- clean scrape percent_low_energy_lighting1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_PercentLowEnergyLight'}) percent_low_energy_lighting = percent_low_energy_lighting1.find('div',{'class':'formControlReadonly'}) percent_low_energy_lighting = percent_low_energy_lighting.get_text().strip() print('% Low Energy Lighting:', percent_low_energy_lighting) #Space Heating Fuel- clean scrape space_heating_fuel1 =dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingFuel'}) space_heating_fuel =space_heating_fuel1.find('div',{'class':'formControlReadonly'}) space_heating_fuel = space_heating_fuel.get_text().strip() print('Space Heating Fuel:',space_heating_fuel) #Space Heating Efficiency- clean scrape space_heating_efficiency1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingEfficiency'}) space_heating_efficiency = space_heating_efficiency1.find('div',{'class':'formControlReadonly'}) space_heating_efficiency= space_heating_efficiency.get_text().strip() print('Space Heating Efficiency:', space_heating_efficiency) #Water Heatng Fuel- clean scrape 
water_heating_fuel1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingFuel'}) water_heating_fuel =water_heating_fuel1.find('div',{'class':'formControlReadonly'}) water_heating_fuel = water_heating_fuel.get_text().strip() print('Water Heating Fuel:', water_heating_fuel) #Water Heating Efficiency- clean scrape water_heating_efficiency1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingEfficiency'}) water_heating_efficiency =water_heating_efficiency1.find('div',{'class':'formControlReadonly'}) water_heating_efficiency= water_heating_efficiency.get_text().strip() print('Water Heating Efficiency:', water_heating_efficiency) #thrid section assessor_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsAssessor'}) #Assessor Number- clean scrape assessor_num1 = assessor_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfAssessor_container_AssessorNumber'}) assessor_num = assessor_num1.find('div',{'class':'formControlReadonly'}) assessor_num= assessor_num.get_text().strip() print('Assessor Number:', assessor_num) print('BER:', num) print('\***************nSCRAPE FINISHED***************\n') #Populate datebase print('\nRECONNECTING WITH DATABASE') with connection.cursor() as cursor: print('SUCCESSFUL CONNECTION') sql =("INSERT INTO table1(BER_number, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)") cursor.execute(sql, (num, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, 
emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)) print('ROW POPULATED') #Calling the function web_scrape() #Metadata print('Gathering Details...') Run_time = datetime.now() - startTime print('Run Time:', Run_time) #Loop Finished print('\n***************PROGRAMME FINISHED***************')

2条回答

网友

1楼 · 编辑于 2024-06-11 21:39:34

@Padraic Cunningham 提供了这个答案的大部分逻辑，但正如我在他答案下面的评论中所说，他的解决方案只解决了我一半的问题。
我已经能够在他的工作基础上解决这个问题。
还有一个步骤需要完成，那就是“点击”通过一个中间页面，它会跳转到我想要抓取的数据所在的页面。

提前为我不规范的命名和格式道歉。我是初学者。

import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
import time
from datetime import datetime
import openpyxl

# BER numbers to look up, one search per entry.
hit_list = [100100403,100100965,100101047,100100874,100100783]  # this is a sample list
# Search form endpoint; every GET and POST in this snippet targets it.
url = 'https://ndber.seai.ie/pass/ber/search.aspx'


def field_update(s):
    """Fetch the search page and return its current hidden-field tokens.

    Args:
        s: A ``requests`` session; it is used to GET ``url``.

    Returns:
        A dict holding the ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION`` values read from the page's hidden inputs,
        ready to be merged into a POST payload.

    Raises:
        TypeError: If one of the hidden inputs is not present in the page
            (``select_one`` then returns ``None``, which is not subscriptable).
    """
    soup = bs4.BeautifulSoup(s.get(url).content, "html.parser")
    tokens = {
        "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
        "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
        "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"],
    }
    # Report progress before returning; a statement placed after ``return``
    # would never execute.
    print('field updated')
    return tokens

# Two-step scrape per BER number: submit the search form, then "click" the
# ViewDetails link on the results page. Every step must run INSIDE the loop,
# otherwise only the last number in hit_list is ever requested.
with requests.session() as s:
    for ber in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': ber,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        # update the post data with new token values
        payload_1.update(field_update(s))
        r = s.post(url, data=payload_1)

        # 'click through' intermediate page
        # THIS IS THE ADDITIONAL CODE THAT BUILDS ON PADRAIC'S ANSWER
        # The tokens for the second POST come from the response to the first.
        soup = bs4.BeautifulSoup(r.content, "html.parser")
        stage_two = {
            "__EVENTTARGET": 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
            "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

        q = s.post(url, data=stage_two)
        print('payload_2 posted')
        # `soup` now holds the details page that the field scraping works on.
        soup = bs4.BeautifulSoup(q.content, 'html.parser')

        print('\nBEGINNING SCRAPE....')
        # FOR DATA TO BE SCRAPED, SEE ORIGINAL QUESTION

网友

2楼 · 编辑于 2024-06-11 21:39:34

每次 POST 请求都需要获取新的 __EVENTVALIDATION 等令牌；不能只是从浏览器中复制这些值并把它们硬编码到 POST 数据里：

import requests
from bs4 import BeautifulSoup

# Search form endpoint; the token GET and the search POST both target it.
url = 'https://ndber.seai.ie/pass/ber/search.aspx'
# Sample BER numbers to search for, one request cycle per entry.
hit_list = [100100403, 100100965, 100101047, 100100874, 100100783]
# NOTE(review): `h` is not referenced anywhere in the visible snippet —
# confirm whether it is still needed.
h = {}


def renew(s):
    """Fetch the search page and return its current hidden-field tokens.

    Args:
        s: A ``requests`` session; it is used to GET ``url``.

    Returns:
        A dict holding the ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION`` values read from the page's hidden inputs.

    Raises:
        TypeError: If one of the hidden inputs is not present in the page
            (``select_one`` then returns ``None``, which is not subscriptable).
    """
    # The parser name must be a properly terminated string literal.
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}


# One search per BER number, all sharing a single session.
with requests.session() as s:
    for num in hit_list:
        # Form fields first, then the freshly fetched token values laid on
        # top of them (same key order as building the dict and updating it).
        payload_1 = {
            **{
                'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
                'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
            },
            **renew(s),
        }
        r = s.post(url, data=payload_1)

        # Parse the search response so it can be scraped.
        soup = BeautifulSoup(r.content, 'html.parser')

如果我们运行代码并分析一些返回的内容，您可以看到我们正确地获取了每个页面：

^{pr2}$

这样就得到了表格中除 BER 证书编号之外的所有信息；该编号您本来就有，所以无需担心。

正如您所发现的，只需把第一次 POST 返回内容中的令牌传给第二个 payload 即可；另外，如果把逻辑封装到函数中，代码也会更易于维护：

def renew(soup):
    """Collect the postback token values from an already-parsed page.

    Args:
        soup: A parsed page exposing ``select_one``; each token is looked up
            by element id and its ``"value"`` entry is read.

    Returns:
        A dict keyed by ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION``, in that order.
    """
    token_fields = ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION")
    # The element id of each hidden input equals the form field name.
    return {field: soup.select_one("#" + field)["value"] for field in token_fields}


def parse_data(soup):
    """Extract the BER summary fields from a parsed details page.

    Args:
        soup: A parsed page exposing ``select_one``; the ``.text`` of each
            selected element is read.

    Returns:
        A dict with the keys ``MPRN``, ``emissions_indicator``,
        ``emissions_indicator_unit``, ``BER_score``, ``BER_actual_rating``,
        ``BER_unit`` and ``address``; every value is a string.

    Raises:
        ValueError: If the emissions text does not split into exactly two
            whitespace-separated parts, or the energy rating text into three.
    """
    def text_of(selector):
        # Raw text of the first element matching the CSS selector.
        return soup.select_one(selector).text

    address = text_of(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress").strip()
    mprn = text_of(
        "#ctl00_DefaultContent_BERSearch_dfBER_container_MPRN div.formControlReadonly").strip()

    # "<value> (<unit>)" -> value and unit, unit without its parentheses.
    emissions_value, emissions_unit = text_of(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue").split()
    emissions_unit = emissions_unit.strip("()")

    # "<score> <rating> (<unit>)" -> three parts, unit without its parentheses.
    score, rating, rating_unit = text_of(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating").split()
    rating_unit = rating_unit.strip("()")

    record = {}
    record["MPRN"] = mprn
    record["emissions_indicator"] = emissions_value
    record["emissions_indicator_unit"] = emissions_unit
    record["BER_score"] = score
    record["BER_actual_rating"] = rating
    record["BER_unit"] = rating_unit
    record["address"] = address
    return record

def submint_to_db(dct):
    """Insert one parsed record into ``table1``.

    The dict keys become the column list and the values are bound to ``%s``
    placeholders, so keys and values stay aligned by dict iteration order.

    Args:
        dct: Mapping of column name to value for a single row.
    """
    # Column names cannot be bound as query parameters, so they are
    # interpolated into the statement; the keys must therefore come from
    # trusted code, never from user input.
    columns = ",".join(dct)
    placeholders = ', '.join(['%s'] * len(dct))
    sql = "INSERT INTO table1 ( %s ) VALUES ( %s )" % (columns, placeholders)
    with connection.cursor() as cursor:
        print('SUCCESSFUL CONNECTION')
        # Bind a real tuple: on Python 3 ``dict.values()`` is a view object,
        # not the tuple/list the driver expects as a parameter sequence.
        cursor.execute(sql, tuple(dct.values()))
    # NOTE(review): nothing here commits the transaction — confirm the
    # connection is opened with autocommit or committed by the caller.

# Static parts of the two POST payloads; the BER number and the token values
# are filled in per iteration below.
payload_1 = {
    'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
payload_2 = {
    '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
}

# Every request goes through the session `s` so cookies set by one step are
# sent with the next; calling requests.get/requests.post directly would start
# from a blank cookie jar each time.
with requests.session() as s:
    for num in hit_list:
        # Fetch fresh token values from the search page for each search.
        tokens = renew(BeautifulSoup(s.get(url).content, "html.parser"))
        # update the post data with new token values
        payload_1['ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber'] = num
        payload_1.update(tokens)
        r = s.post(url, data=payload_1)
        # The tokens for the "ViewDetails" click come from the results page.
        tokens2 = renew(BeautifulSoup(r.content, 'html.parser'))
        payload_2.update(tokens2)
        soup = BeautifulSoup(s.post(url, data=payload_2).content, "html.parser")
        submint_to_db(parse_data(soup))

我没有解析所有数据，但其余数据的逻辑是相同的，打印解析后返回的dict将为您提供：

{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '57.83', 'address': '24 CLONEE COURTMAIN STREETCLONEECO. MEATH', 'BER_score': 'D1', 'BER_actual_rating': '235.54', 'MPRN': '10003467711'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '42.4', 'address': '19 GORTANORADINGLECO. KERRY', 'BER_score': 'C1', 'BER_actual_rating': '165.79', 'MPRN': '10301654014'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '34.03', 'address': '8 CANNON PLACE1 HERBERT ROADDUBLIN 4', 'BER_score': 'C2', 'BER_actual_rating': '175.32', 'MPRN': '10002082335'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '53.51', 'address': '12 GORTANORADINGLECO. KERRY', 'BER_score': 'C3', 'BER_actual_rating': '208.45', 'MPRN': '10301653940'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '121.54', 'address': '13 RENMORE ROADGALWAY CITY', 'BER_score': 'G', 'BER_actual_rating': '472.19', 'MPRN': '10010500405'}

相关问题更多 >

编程相关推荐

热门问题

热门文章