
Summary: I want to iterate over the `requests` payload so that I can change the login ID number for each scrape. Why can't I loop over my pages by changing the `payload` in `requests`?

I'm using Requests & Beautiful Soup to do some web scraping. To log in to the page I need to enter a unique ID number; I have a list of these numbers, called hit_list.

For any single ID number this script works just fine. But what I want to do is automate it so that it runs through my entire hit_list.

In other words, I want num in payload_1 to change on every iteration. At the moment num stays the same and the scrape simply repeats according to the length of hit_list (i.e. in this case the same scrape runs five times).

Please note that I'm very new to coding and this is my first project. I realise it probably has problems and I'd be happy to receive constructive criticism.
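To make the goal concrete, here is a stripped-down sketch of the behaviour I'm after (the `__VIEWSTATE` field and the scraping/database steps are left out for brevity; only the search POST is shown):

import requests

url = 'https://ndber.seai.ie/pass/ber/search.aspx'
hit_list = [100100403, 100100965, 100101047, 100100874, 100100783]

with requests.session() as r:
    for num in hit_list:
        #What I want: a fresh payload on every pass, so that each POST
        #searches for a different BER number from hit_list
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
        }
        r.post(url, data=payload_1)

My full code is below.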

#Importing libraries
import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
from datetime import datetime
import openpyxl


#Recording time @ Start 
startTime = datetime.now() 
print(datetime.now()) 

#use pymysql to create database- omitted here for parsimony 

#This is a sample list, in reality the list will have 100,000 + numbers. 
hit_list = [100100403,100100965,100101047,100100874,100100783] 

""" 
This is my code for importing the real list, included here in case the way the list is imported is relevant to the problem
wb = openpyxl.load_workbook('/Users/Seansmac/Desktop/stage2_trial.xlsx') 
sheet= wb.get_sheet_by_name('Sheet1') 
type(wb) 
#LOUIS: Only importing first twenty (for trial purposes) 
for id in range(1,20): 
    hit_list.append(sheet.cell(row=id, column =1).value) 
""" 

def web_scrape():
    #I'm only creating a function because I'm told it's good practice to put any 'bit' of logic into a function - I'm aware this probably looks amateurish.
    #Open page
    url = 'https://ndber.seai.ie/pass/ber/search.aspx'

    with requests.session() as r:
     r.headers.update({
      'user-agent': 'For more information on this data collection please contact **************************************'
     })

     for num in hit_list:
      #***LOCATION OF THE PROBLEM***
      payload_1 = {
        'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
        'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
       '__VIEWSTATE' :'/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFgxmD2QWAgIBD2QWAgIBD2QWAmYPZBYCZg9kFgQCAQ8WAh4JaW5uZXJodG1sZWQCAw9kFgICAg9kFgJmD2QWBAIBD2QWAgIDDw8WCB4EXyFTQgKAAh4MRGVmYXVsdFdpZHRoHB4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBgU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfBxxkZAIEDxQrAAJkEBYAFgAWABYCZg9kFgICAg9kFgJmDzwrABECARAWABYAFgAMFCsAAGQCBg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCDA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQYAQUzY3RsMDAkRGVmYXVsdENvbnRlbnQkQkVSU2VhcmNoJGdyaWRSYXRpbmdzJGdyaWR2aWV3D2dkrGhAYkdLuZZh8E98usAnWAaRMxurQ1Gquc+9krb7Boc=', 
      }
      r.post(url, data=payload_1)
      #Click through the intermediate page
      payload_2 = {
        '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
        '__VIEWSTATE': "/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFg5mD2QWAgIBDxYCHwJoFgICAQ8PFgIfAmhkFgJmD2QWAmYPZBYEAgEPFgIeCWlubmVyaHRtbGVkAgMPZBYCAgIPZBYCZg9kFgQCAQ9kFgICAw8PFgoeBF8hU0ICgAIeDERlZmF1bHRXaWR0aBweBFRleHQFCTEwMDEwMDMxMh4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBwU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfCBxkZAICDw8WAh8CZ2QWAmYPZBYCZg9kFgICAw9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgJmD2QWAgIBDxYCHwMFDlNlYXJjaCBSZXN1bHRzZAIEDxQrAAIPFgYfAmceElNlbGVjdGVkUm93SW5kZXhlczLNAQABAAAA/////wEAAAAAAAAABAEAAAB+U3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24HAAAICAgJAgAAAAAAAAABAAAADwIAAAAAAAAACAseCmVkaXRfc3R5bGULKXNWMS5ORVQuV2ViQ29udHJvbHMuRWRpdFN0eWxlLCBWMS5ORVQuV2ViQ29udHJvbHMsIFZlcnNpb249MS40LjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj01YmYzNDU3ZDMwODk1MjEzAmQQFgAWABYAFgJmD2QWAgICD2QWAmYPPCsAEQMADxYEHgtfIURhdGFCb3VuZGceC18hSXRlbUNvdW50AgFkARAWABYAFgAMFCsAABYCZg9kFgICAQ9kFgpmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BQkxMDAxMDAzMTIfBgUJMTAwMTAwMzEyZGQCAQ9kFgJmDw8WAh8GBQNCRVJkZAICD2QWAmYPDxYCHwYFCzEwMDE1MTAwMDkwZGQCAw9kFgJmDw8WAh8GBQowNy0wMS0yMDA5ZGQCBA9kFgJmDw8WAh8GBSQzMCBNQVJJTkUgVklFVw1BVEhMT05FDUNPLiBXRVNUTUVBVEhkZAIGDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIIDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIKDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIMDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZBgBBTNjdGwwMCREZWZhdWx0Q29udGVudCRCRVJTZWFyY2gkZ3JpZFJhdGluZ3MkZ3JpZHZpZXcPPCsADAEIAgFkjLH/5QxuANxuCh3kAmhUU/4/OZj+wy8nJDYIFx4Lowo=", 
        '__VIEWSTATEGENERATOR':"1F9CCB97",     
        '__EVENTVALIDATION': "/wEdAAbaTEcivWuxiWecwu4mVYO9eUnQmzIzqu4hlt+kSDcrOBWCa0ezllZh+jGXjO1EB1dmMORt6G1O0Qbn0WLg3p+rPmLeN6mjN7eq7JtUZMjpL2DXqeB/GqPe7AFtNDKiJkEPdN6Y/vq7o/49hX+o366Ioav3zEBl37yPlq3sYQBXpQ==", 
      }
      s = r.post(url, data=payload_2)
      #Scrape the page
      soup = bs4.BeautifulSoup(s.content, 'html.parser')

""" 

FOR THE PURPOSES OF MY ISSUE EVERYTHING BELOW WORKS FINE & CAN BE SKIPPED 

""" 

print('\nBEGINNING SCRAPE....')     
# First Section      
      ber_dec = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsBER'})    
#Address- clean scrape 
      address = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'}) 
      address = (address.get_text(',').strip()) 
      print('address:', address)    
      #Date of Issue- clean scrape
      date_issue1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue'}) 
      date_issue = date_issue1.find('div', {'class':'formControlReadonly'})   
      date_issue = (date_issue.get_text().strip()) 
      print('date_of_issue:',date_issue)    
      #MPRN- clean scrape
      MPRN1 = ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN'}) 
      MPRN = MPRN1.find('div',{'class':'formControlReadonly'}) 
      MPRN = MPRN.get_text().strip() 
      print('MPRN:', MPRN)    
      #Emissions Indicator- clean scrape
      emissions_indicator1 = ber_dec.find('div',{'id':'ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue'}) 
      emissions_indicator_bunched = emissions_indicator1.get_text().strip()    
      print('\n\nem_bunched:',emissions_indicator_bunched)   
      emissions_indicator, emissions_indicator_unit = emissions_indicator_bunched.split() 
      print('emissions_indicator:',emissions_indicator)  
      emissions_indicator_unit= emissions_indicator_unit.replace("(","") 
      emissions_indicator_unit=emissions_indicator_unit.replace(")","") 
      print('emissions_indicator_unit:',emissions_indicator_unit)    

      #BER Score- clean scrape  
      BER_bunched = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating'}) 
      BER_bunched =(BER_bunched.get_text().strip()) 
      print ('\n \nBER_bunched:', BER_bunched)     
      BER_score, BER_actual_rating, BER_unit = BER_bunched.split()  
      print('\nBER_score:',BER_score) 
      print('\nBER_actual_rating:',BER_actual_rating) 
      BER_unit = BER_unit.replace("(", " ") 
      BER_unit = BER_unit.replace(")","") 
      print('\nClean_BER_unit:',BER_unit) 

      #Type of Rating- clean scrape 
      type_of_rating1= ber_dec.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfBER_container_TypeOfRating'}) 
      type_of_rating= type_of_rating1.find('div',{'class':'formControlReadonly'}) 
      type_of_rating = type_of_rating.get_text().strip() 
      print('type_of_rating:',type_of_rating) 


      # Second Section 

      dwelling_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsStructure'}) 

      #Dwelling Type- clean scrape 
      dwelling_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DwellingType'}) 
      dwelling_type = dwelling_type1.find('div',{'class':'formControlReadonly'}) 
      dwelling_type = dwelling_type.get_text().strip() 
      print ('Dwelling Type:', dwelling_type)  

      #Number of Stories- clean scrape 
      num_stories1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_NoStoresy'}) 
      num_stories = num_stories1.find('div',{'class':'formControlReadonly'}) 
      num_stories = num_stories.get_text().strip() 
      print('Number of Stories:', num_stories) 

      #Year of Construction- clean scrape 
      yr_construction1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DateOfConstruction'}) 
      yr_construction = yr_construction1.find('div',{'class':'formControlReadonly'})  
      yr_construction = yr_construction.get_text().strip() 
      print('Year of Construction:', yr_construction)    

      #Floor Area- clean scrape 
      floor_area= dwelling_details.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_div_FloorArea'}) 
      floor_area = floor_area.get_text().strip() 
      floor_area, floor_area_unit =floor_area.split() 
      floor_area_unit = floor_area_unit.replace("(","") 
      floor_area_unit=floor_area_unit.replace(")","") 
      print('\nFloor Area:', floor_area) 
      print('floor_area_unit:', floor_area_unit) 

      #Wall Type- clean scrape 
      wall_type1 = dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_WallType'}) 
      wall_type = wall_type1.find('div',{'class':'formControlReadonly'})  
      wall_type= wall_type.get_text().strip() 
      print('Wall Type:', wall_type) 

      #Glazing Type- clean scrape 
      glazing_type1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_GlazingType'}) 
      glazing_type =glazing_type1.find('div',{'class':'formControlReadonly'}) 
      glazing_type = glazing_type.get_text().strip() 
      print('Glazing Type:', glazing_type) 

      #Percent Low Energy Lighting- clean scrape 
      percent_low_energy_lighting1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_PercentLowEnergyLight'}) 
      percent_low_energy_lighting = percent_low_energy_lighting1.find('div',{'class':'formControlReadonly'})  
      percent_low_energy_lighting = percent_low_energy_lighting.get_text().strip() 
      print('% Low Energy Lighting:', percent_low_energy_lighting) 

      #Space Heating Fuel- clean scrape 
      space_heating_fuel1 =dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingFuel'}) 
      space_heating_fuel =space_heating_fuel1.find('div',{'class':'formControlReadonly'}) 
      space_heating_fuel = space_heating_fuel.get_text().strip() 
      print('Space Heating Fuel:',space_heating_fuel) 

      #Space Heating Efficiency- clean scrape 
      space_heating_efficiency1= dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingEfficiency'}) 
      space_heating_efficiency = space_heating_efficiency1.find('div',{'class':'formControlReadonly'})   
      space_heating_efficiency= space_heating_efficiency.get_text().strip() 
      print('Space Heating Efficiency:', space_heating_efficiency) 

      #Water Heatng Fuel- clean scrape 
      water_heating_fuel1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingFuel'}) 
      water_heating_fuel =water_heating_fuel1.find('div',{'class':'formControlReadonly'}) 
      water_heating_fuel = water_heating_fuel.get_text().strip() 
      print('Water Heating Fuel:', water_heating_fuel) 

      #Water Heating Efficiency- clean scrape 
      water_heating_efficiency1 =dwelling_details.find('span',{'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingEfficiency'}) 
      water_heating_efficiency =water_heating_efficiency1.find('div',{'class':'formControlReadonly'})  
      water_heating_efficiency= water_heating_efficiency.get_text().strip() 
      print('Water Heating Efficiency:', water_heating_efficiency) 


      #Third section
      assessor_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsAssessor'}) 

      #Assessor Number- clean scrape 
      assessor_num1 = assessor_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfAssessor_container_AssessorNumber'}) 
      assessor_num = assessor_num1.find('div',{'class':'formControlReadonly'}) 
      assessor_num= assessor_num.get_text().strip() 
      print('Assessor Number:', assessor_num) 

      print('BER:', num) 

      print('\n***************SCRAPE FINISHED***************\n')


      #Populate database
      print('\nRECONNECTING WITH DATABASE') 
      with connection.cursor() as cursor: 
       print('SUCCESSFUL CONNECTION') 
       sql =("INSERT INTO table1(BER_number, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)") 
       cursor.execute(sql, (num, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)) 
       print('ROW POPULATED') 

#Calling the function 
web_scrape() 

#Metadata 
print('Gathering Details...') 
Run_time = datetime.now() - startTime 
print('Run Time:', Run_time) 

#Loop Finished   

print('\n***************PROGRAMME FINISHED***************') 

Your code already iterates over every element of 'hit_list' and fires a new POST request each time. Do you want 'num' in 'payload_1' to actually change value? –


@AkshatMahajan, I want 'num' to change value. So in the first 'POST' request num would be '100100403', in the second 'POST' it would be '100100965', and so on. –

Answers


@PadraicCunningham provided most of the logic for this answer, but as I describe in my comment below his answer, his solution only got me halfway.
I was able to build on his work to fix the problem.
The step that remained was to 'click through' the intermediate page, which leads to where the data I want to mine lives.

Apologies in advance for my non-standard labelling and formatting. I'm a beginner.

import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
from datetime import datetime
import openpyxl

hit_list = [100100403,100100965,100101047,100100874,100100783] #this is a sample list 
#Open page 
url = 'https://ndber.seai.ie/pass/ber/search.aspx' 


def field_update(s):
    #Fetch the search page and pull out the current ASP.NET form tokens
    soup = bs4.BeautifulSoup(s.get(url).content, "html.parser")
    print('fields updated')
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

with requests.session() as s:
    for ber in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': ber,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        #update the post data with new token values
        payload_1.update(field_update(s))
        r = s.post(url, data=payload_1)

        #'click through' the intermediate page
        #THIS IS THE ADDITIONAL CODE THAT BUILDS ON PADRAIC'S ANSWER
        soup = bs4.BeautifulSoup(r.content, "html.parser")
        stage_two = {
            '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
            "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

        q = s.post(url, data=stage_two)
        print('payload_2 posted')
        soup = bs4.BeautifulSoup(q.content, 'html.parser')

        print('\nBEGINNING SCRAPE....')
        #FOR THE DATA TO BE SCRAPED, SEE THE ORIGINAL QUESTION

You need to fetch fresh __EVENTVALIDATION tokens etc. for every post; you cannot just copy them from your browser and hardcode the values into your post data:

import requests
from bs4 import BeautifulSoup

url = 'https://ndber.seai.ie/pass/ber/search.aspx'
hit_list = [100100403, 100100965, 100101047, 100100874, 100100783]


def renew(s):
    #Pull the current form tokens from a fresh GET of the search page
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}


with requests.session() as s:
    for num in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        #update the post data with new token values
        payload_1.update(renew(s))
        r = s.post(url, data=payload_1)

        #scrape the page
        soup = BeautifulSoup(r.content, 'html.parser')

If we run the code and examine a little of what comes back, you can see that we get each page correctly:

In [8]: with requests.session() as s: 
    ...:   for num in hit_list: 
    ...:     payload_1 = { 
    ...:      'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': str(num), 
    ...:      'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'} 
    ...:     payload_1.update(renew(s)) 
    ...:     r = s.post(url, data=payload_1) 
    ...:     soup = BeautifulSoup(r.content, 'html.parser') 
    ...:     spans = soup.select("#ctl00_DefaultContent_BERSearch_gridRatings_gridview tr.GridRowStyle td span") 
    ...:     print(spans) 
    ...:   
[<span>BER</span>, <span>10003467711</span>, <span>07-01-2009</span>, <span>24 CLONEE COURT\rMAIN STREET\rCLONEE\rCO. MEATH</span>] 
[<span>BER</span>, <span>10301654014</span>, <span>26-11-2014</span>, <span>19 GORTANORA\rDINGLE\rCO. KERRY</span>] 
[<span>BER</span>, <span>10002082335</span>, <span>08-01-2009</span>, <span>8 CANNON PLACE\r1 HERBERT ROAD\rDUBLIN 4</span>] 
[<span>BER</span>, <span>10301653940</span>, <span>18-01-2015</span>, <span>12 GORTANORA\rDINGLE\rCO. KERRY</span>] 
[<span>BER</span>, <span>10010500405</span>, <span>07-01-2009</span>, <span>13 RENMORE ROAD\rGALWAY CITY</span>] 

That pulls the info columns from the results table, including the BER certificate number, which you already have, so you don't need to worry about it.
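If you want the cells individually, the four spans always come back in the same order, so (assuming the search matched a row) you can unpack them in one go; the variable names here are only illustrative:

#The four cells are: rating type, certificate number, date of issue, address
rating_type, cert_number, date_of_issue, address = (sp.text for sp in spans)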

Once you have that worked out, you just pass data from what the first post returns into the second payload; and if you wrap the logic in functions it also makes your code a little easier to manage:

def renew(soup): 
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"], 
      "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"], 
      "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]} 


def parse_data(soup): 
    address = soup.select_one("#ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress").text.strip() 
    MPRN = soup.select_one("#ctl00_DefaultContent_BERSearch_dfBER_container_MPRN div.formControlReadonly").text.strip() 
    emissions_indicator, emissions_indicator_unit = soup.select_one(
     "#ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue").text.split() 
    emissions_indicator_unit = emissions_indicator_unit.strip("()") 
    BER_score, BER_actual_rating, BER_unit = soup.select_one(
     "#ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating").text.split() 
    BER_unit = BER_unit.strip("()") 
    return {"MPRN": MPRN, "emissions_indicator": emissions_indicator, 
      "emissions_indicator_unit": emissions_indicator_unit, 
      "BER_score": BER_score, "BER_actual_rating": BER_actual_rating, 
      "BER_unit": BER_unit, "address": address} 

def submit_to_db(dct):
    with connection.cursor() as cursor:
        print('SUCCESSFUL CONNECTION')
        #Build the column list and placeholders from the dict keys
        sql = "INSERT INTO table1 (%s) VALUES (%s)" % (",".join(dct), ', '.join(['%s'] * len(dct)))
        cursor.execute(sql, list(dct.values()))

payload_1 = { 
    'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'} 
payload_2 = { 
    '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails', 
} 

with requests.session() as s:
    tokens = renew(BeautifulSoup(s.get(url).content, "html.parser"))
    for num in hit_list:
        #update the post data with new token values
        payload_1['ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber'] = num
        payload_1.update(tokens)
        r = s.post(url, data=payload_1)
        tokens2 = renew(BeautifulSoup(r.content, 'html.parser'))
        payload_2.update(tokens2)
        soup = BeautifulSoup(s.post(url, data=payload_2).content, "html.parser")
        submit_to_db(parse_data(soup))

I didn't parse all the data, but the logic is the same for the rest; printing the dict that parse_data returns gives you:

{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '57.83', 'address': '24 CLONEE COURTMAIN STREETCLONEECO. MEATH', 'BER_score': 'D1', 'BER_actual_rating': '235.54', 'MPRN': '10003467711'} 
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '42.4', 'address': '19 GORTANORADINGLECO. KERRY', 'BER_score': 'C1', 'BER_actual_rating': '165.79', 'MPRN': '10301654014'} 
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '34.03', 'address': '8 CANNON PLACE1 HERBERT ROADDUBLIN 4', 'BER_score': 'C2', 'BER_actual_rating': '175.32', 'MPRN': '10002082335'} 
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '53.51', 'address': '12 GORTANORADINGLECO. KERRY', 'BER_score': 'C3', 'BER_actual_rating': '208.45', 'MPRN': '10301653940'} 
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '121.54', 'address': '13 RENMORE ROADGALWAY CITY', 'BER_score': 'G', 'BER_actual_rating': '472.19', 'MPRN': '10010500405'} 
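Extending parse_data to the remaining fields follows the same select-and-strip pattern; for example, the date of issue (container id taken from the code in the question; a sketch, untested against the live page) could be pulled like this:

def parse_date_issue(soup):
    #Same pattern as parse_data: locate the container by id, then read
    #the readonly form control's text
    return soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue "
        "div.formControlReadonly").text.strip()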

This is insane! You're stripping out the 'beautifulsoup' tags! Don't do that! :) – alecxe


@alecxe, lol, usually the roles are reversed; you must be having a day off :) –


Thanks for the reply @PadraicCunningham, but unfortunately it only gets me halfway. As you can see in my original post, I use 'payload_2' to click through an intermediate page. The data I need is not what you printed (that comes from the 'intermediate page') but the data on the page after it. I tried redoing your solution for the second stage, but no luck. I think the URL causes the problem, since it's the same URL for every page. –
