I am new to programming. I built a Python web scraper using Beautiful Soup, but when I run the program it just opens the Python command line with a blinking cursor and nothing happens... Now I am getting these Beautiful Soup / Python errors (WinError 10060):

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
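For reference, WinError 10060 is what Windows raises when the remote host never answers the connection attempt. A minimal way to see which request is hanging is to pass an explicit timeout to urlopen and print the failing URL. This is only a diagnostic sketch; the helper name and the 30-second value are illustrative, not part of the original script:

import urllib.error
import urllib.request

def fetch_with_timeout(url, data=None, timeout=30):
    # Hypothetical helper: fail fast instead of hanging, and report which URL failed.
    try:
        response = urllib.request.urlopen(url, data, timeout=timeout)
        return response.read().decode("utf-8")
    except (urllib.error.URLError, TimeoutError, ConnectionResetError) as exc:
        print("Request to", url, "failed:", exc)
        raise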
Please don't mind the indentation. Here is my code:
import urllib.request
import urllib
import json
import xml.etree.ElementTree as ET
import csv
from bs4 import BeautifulSoup
link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"
alldata = []
links = {}
certificatedata = []
def getData(url, values):
    # POST the form values to the given URL and return the decoded response body.
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    data = response.read()
    data = data.decode("utf-8")
    return data

def getDivsion():
    ## for now we are taking 6 divisions.. it needs to be updated when the data gets updated
    return range(1, 7)

def getDistrict(divId):
    # Fetch the district list for one division.
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data):
    parsed = json.loads(data)
    return parsed

def getTaluka(disId):
    global talukaLink
    values = {'DisID': disId}
    data = getData(talukaLink, values)
    return data

def getProjects(divId, disId):
    # Fetch the project list for one division/district pair.
    global prjLink
    values = {'DisID': disId, 'DivID': divId}
    #print(values)
    data = getData(prjLink, values)
    if len(data) < 10:
        return "{}"
    return data

def getProjectsList():
    # Walk every division/district/project combination and parse the search results.
    divList = getDivsion()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId, 'hdnDistrict': '', 'hdnProject': '', 'District': disId, 'Taluka': '', 'Village': '', 'Project': prjId, 'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break

def parseXMLData(htmldata):
    # Pull project name, promoter name, certificate number and detail link
    # out of the results table.
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        print(len(alldata))
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody) > 0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst) > 6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst) > 0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/" + href
                            links[certNo] = link
                            sublist.append(link)
                    if len(sublist) > 0:
                        alldata.append(sublist)
    return alldata

def writedata(alldata1, filename):
    # Dump a list of rows to a CSV file.
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./" + filename, 'w') as csvfile:
        csvfile = csv.writer(csvfile, delimiter=',')
        #csvfile.writerow(titleRow)
        csvfile.writerow("")
        for i in range(0, len(alldata1)):
            #print(alldata1[i])
            csvfile.writerow(alldata1[i])

def processlinksforcert():
    # Visit each certificate detail page and collect the professionals table.
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno in links.keys():
        link = links[certno]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser")
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table) <= 0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr["class"]:
                    print(len(certificatedata))
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    index = 1
                    while index < len(tr_lst):
                        #import pdb; pdb.set_trace()
                        #for tr in tr_lst:
                        #if index==0:
                        #    continue
                        tr = tr_lst[index]
                        index += 1
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst) > 2:
                            sublist.append(certno)
                            pername = formattext(td_lst[0].text)
                            cerno = formattext(td_lst[1].text)
                            proftype = formattext(td_lst[2].text)
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata

def formattext(text):
    # Strip line breaks and spaces from scraped cell text.
    while text.find("\r\n") >= 0:
        text = text.replace("\r\n", "")
    while text.find(" ") >= 0:
        text = text.replace(" ", "")
    return text

def main():
    global alldata, certificatedata
    #data = getData(url, {})
    getProjectsList()
    print("Before write the projects data to the file. Count >> " + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> " + str(len(data)))
    writedata(data, "certificates.csv")

main()
Can anyone please suggest what I am doing wrong... I installed everything with pip, including beautifulsoup. Please don't mind the indentation, it is only like this here....
I think the problem is that I cannot even reach some of your URLs from my browser, for example "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka". I suspect you may have to pass cookies or login information to the target site for the code to fetch data. Right now it cannot reach the source at all, which is why it crashes once the timeout expires. – Grynets
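If the site really does need cookies or a browser-like request before it will answer, one way to test that theory with the same urllib stack is a cookie-aware opener that loads the search page once and then reuses the session for the POST calls. This is only a sketch of the commenter's suggestion; the User-Agent value and the idea that a first GET sets the needed cookies are assumptions, not confirmed behaviour of the site:

import http.cookiejar
import urllib.parse
import urllib.request

# Cookie-aware opener so any session cookie set by the first page is re-sent later.
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [("User-Agent", "Mozilla/5.0")]  # assumed browser-like header

# Load the search page once to pick up cookies, then POST as the scraper already does.
opener.open("https://maharerait.mahaonline.gov.in/searchlist/searchlist", timeout=30)
body = urllib.parse.urlencode({"DivID": 1}).encode("utf-8")
response = opener.open("https://maharerait.mahaonline.gov.in/SearchList/GetDistrict", body, timeout=30)
print(response.read().decode("utf-8"))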
But I built a scraper for the same website in Excel VBA and it works fine... though I have noticed that when I ping maharerait.mahaonline.gov.in, it returns 100% loss –
You asked for suggestions, and as I see it the problem is not with your code. There is an issue with this particular website, because, as you said, you get 100% loss. That is why the script does not work. You need to investigate the main differences between your code and your Excel VBA code. I cannot help there, because I have no experience with Excel VBA. However, I suspect the issue is hidden somewhere close to cookies or authentication. – Grynets
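Given the intermittent connectivity (the 100% ping loss mentioned above), another small change worth trying is to give getData an explicit timeout and a few retries, so a single dropped connection does not abort the whole run. A rough sketch of such a replacement for the getData function above; the retry count and delay are arbitrary, and it does not address any cookie or authentication requirement the site may have:

import time
import urllib.error
import urllib.parse
import urllib.request

def getData(url, values, retries=3, timeout=30):
    # Same POST as the original getData, but with a timeout and simple retries.
    body = urllib.parse.urlencode(values).encode('utf-8')
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, body)
            with urllib.request.urlopen(req, timeout=timeout) as response:
                return response.read().decode('utf-8')
        except (urllib.error.URLError, TimeoutError, ConnectionResetError) as exc:
            print("Attempt", attempt, "for", url, "failed:", exc)
            time.sleep(5)  # arbitrary pause before retrying
    raise RuntimeError("Could not reach " + url + " after " + str(retries) + " attempts")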