
I am new to programming. I created a web scraper in Python using Beautiful Soup, but when I run the program it just opens the Python console with a blinking cursor and nothing happens... Now I am getting these errors (Beautiful Soup / Python, error 10060), and it is urgent:

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

...please don't mind the indentation.

Below is my code:

import urllib.request 
import urllib.parse
import json 
import xml.etree.ElementTree as ET 
import csv 
from bs4 import BeautifulSoup 

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist' 
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka" 
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict" 
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName" 

alldata = [] 

links = {} 
certificatedata = [] 

def getData(url, values): 
    data = urllib.parse.urlencode(values) 
    data = data.encode('utf-8') 
    req = urllib.request.Request(url, data) 
    response = urllib.request.urlopen(req)
    data = response.read() 
    data = data.decode("utf-8") 
    return data 


def getDivsion():
    ## for now we are taking 6 districts.. it needs to be updated when the data gets updated
    return range(1, 7)


def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data): 
    parsed = json.loads(data) 
    return parsed 

def getTaluka(disId): 
    global talukaLink 
    values= {'DisID': disId} 
    data = getData(talukaLink, values) 
    return data 

def getProjects(divId, disId): 
    global prjLink 
    values= {'DisID': disId, 'DivID': divId} 
    #print(values) 
    data = getData(prjLink, values) 
    if len(data) < 10:
        return "{}"
    return data

def getProjectsList():
    divList = getDivsion()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId, 'hdnDistrict': '', 'hdnProject': '', 'District': disId, 'Taluka': '', 'Village': '', 'Project': prjId, 'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break

def parseXMLData(htmldata):
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        print(len(alldata))
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody) > 0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst) > 6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst) > 0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/" + href
                            links[certNo] = link
                            sublist.append(link)
                    if len(sublist) > 0:
                        alldata.append(sublist)
    return alldata


def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./" + filename, 'w', newline='') as csvfile:  # newline='' avoids blank rows on Windows
        csvfile = csv.writer(csvfile, delimiter=',')
        #csvfile.writerow(titleRow)
        csvfile.writerow("")
        for i in range(0, len(alldata1)):
            #print(alldata1[i])
            csvfile.writerow(alldata1[i])


def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno in links.keys():
        link = links[certno]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser")
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table) <= 0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr["class"]:
                    print(len(certificatedata))
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    index = 1
                    while index < len(tr_lst):
                        #import pdb; pdb.set_trace()
                        #for tr in tr_lst:
                        #if index==0:
                        #    continue
                        tr = tr_lst[index]
                        index += 1
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst) > 2:
                            sublist.append(certno)
                            pername = formattext(td_lst[0].text)
                            cerno = formattext(td_lst[1].text)
                            proftype = formattext(td_lst[2].text)
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata

def formattext(text):
    while text.find("\r\n") >= 0:
        text = text.replace("\r\n", "")
    while text.find(" ") >= 0:
        text = text.replace(" ", "")
    return text

def main():
    global alldata, certificatedata
    #data = getData(url, {})
    getProjectsList()
    print("Before write the projects data to the file. Count >> " + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> " + str(len(data)))
    writedata(data, "certificates.csv")


main() 

Can someone please suggest what I am doing wrong... I installed everything with pip, including beautifulsoup... please don't mind the indentation, it is only like that for posting here....


I think the problem is that I can't even reach some of your URLs, e.g. "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka", from my browser. I suspect you may have to pass cookies or login information to the target site for your code to fetch data. Right now it cannot reach the source at all, and that is why it crashes once the timeout expires. – Grynets
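
A rough sketch of what that could look like with the urllib-based getData() from the question: a browser-like User-Agent, a cookie jar, and an explicit timeout. The header values and the 30-second timeout are assumptions for illustration only; the site is not confirmed to require any of them.

import http.cookiejar
import urllib.parse
import urllib.request

# Sketch only, not from the original post: same POST as getData(), but the opener
# keeps any cookies the site sets, and the request carries browser-like headers.
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

def get_data_with_headers(url, values, timeout=30):
    data = urllib.parse.urlencode(values).encode("utf-8")
    req = urllib.request.Request(url, data, headers={
        "User-Agent": "Mozilla/5.0",           # assumed browser-like UA
        "X-Requested-With": "XMLHttpRequest",  # assumption: the endpoints are AJAX handlers
    })
    with opener.open(req, timeout=timeout) as response:
        return response.read().decode("utf-8")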


But I created a macro in Excel VBA for the same website and it works fine... though I noticed that when I ping maharerait.mahaonline.gov.in it returns 100% loss –


You asked for suggestions, and as far as I can see the problem is not in your code. There is an issue with this particular website, because, as you said, you get 100% loss when you ping it. That is why the script is not working. You need to investigate the main difference between your code and your Excel VBA code. I can't help there, since I have no experience with Excel VBA, but I suspect the problem is hidden somewhere close to cookies or authentication. – Grynets
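
If cookies do turn out to be the difference, here is a minimal sketch of the same fetch done with a requests.Session, which keeps cookies between calls automatically. It assumes `pip install requests`; the initial visit to the search page and the User-Agent value are assumptions, not something the site is known to need.

import requests

# Sketch only: a drop-in alternative to the question's getData() that reuses one
# session (and therefore one cookie store) for every request.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})  # assumed browser-like UA
# Visit the search page once so any session cookies get stored on the Session.
session.get("https://maharerait.mahaonline.gov.in/searchlist/searchlist", timeout=30)

def get_data(url, values):
    response = session.post(url, data=values, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    return response.text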

Answer


I solved it by using Selenium. Thanks a lot, everyone.
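
For anyone who lands on the same problem, a minimal sketch of the Selenium route (not my exact code; it assumes `pip install selenium` and a matching browser driver on the PATH). The rendered page can then be handed to BeautifulSoup exactly as parseXMLData() already does.

from bs4 import BeautifulSoup
from selenium import webdriver

# Sketch only: let a real browser fetch the page, then parse it as before.
driver = webdriver.Chrome()  # or webdriver.Firefox()
driver.get("https://maharerait.mahaonline.gov.in/searchlist/searchlist")
driver.implicitly_wait(10)   # give the page time to finish loading

soup = BeautifulSoup(driver.page_source, "html.parser")
print(len(soup.find_all("table")))  # same entry point as parseXMLData()

driver.quit()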