Hi: I'm new to ScraperWiki and Python, and I'm trying to figure out how to return "NA" or something similar for web pages that don't match my cssselect specification, i.e. how to return "N/A" for blank values in Python and ScraperWiki.
In my code below, I scrape a doubly nested set of web pages. When I scrape a subpage that has no value for the cssselect attribute, it just copies the value from the last scraped page that did have one.
Any tips? Thanks! Todd
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Title'] = table_cells[0].text_content()
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            ids = caseroots.cssselect("div div div div a")
            #turns out that the data i want is third and second instance. BUT THE PROBLEM I HAVE IS THAT IT COPIES THE PREVIOUS ROW IF NULL.
            for i in ids:
                if len(ids) >= 2:
                    record['Rules'] = ids[2].text_content()
                    record['Treaty'] = ids[3].text_content()
                else:
                    return None
                    #record['Rules'] = "NA"
                    #record['Treaty'] = "NA"
                    #pass
                    #print "None"
            # As you can see, i have experimented with different ways of returning nothing.
            pars = caseroots.cssselect("span.'case-doc-details'")
            for par in pars:
                for i in pars:
                    pars1 = pars[0].cssselect("a")
                    if len(pars1) >= 0:
                        record['DetailsURL'] = pars1[0].attrib.get('href')
                    else:
                        return None
                    # Create a third level of scrape.
                    caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars1[0].attrib.get('href')).read()
                    print caselinkurl2
                    caseroots2 = lxml.html.fromstring(caselinkurl2)
                    pars2 = caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                    for i in pars2:
                        if len(pars2) >= 0:
                            record['Doc Date'] = pars2[0].text_content()
                        else:
                            return None
                    pars3 = caseroots2.cssselect("div.'field-item even' span.'file' a")
                    for i in pars3:
                        if len(pars3) >= 0:
                            record['Doc Type Link'] = pars3[0].attrib.get('href')
                            record['Doc Type'] = pars3[0].text_content()
                        else:
                            return None
                    pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    for i in pars4:
                        if len(pars4) >= 0:
                            record['Claimant Nominee'] = pars4[0].text_content()
                        else:
                            return None
                    pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    for i in pars5:
                        if len(pars5) >= 0:
                            record['Respondent Nominee'] = pars5[0].text_content()
                        else:
                            return None
                    pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    for i in pars6:
                        if len(pars6) >= 0:
                            record['President'] = pars6[0].text_content()
                        else:
                            return None
            print record, '------------'
            scraperwiki.sqlite.save(["Title"], record)
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)
# Start here:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
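
For what it's worth, this is the kind of fallback I have in mind: a small helper that returns "N/A" whenever a cssselect query comes back empty or shorter than expected, plus building a fresh record for each row so old values can't carry over. The helper names (text_or_na, attrib_or_na) are just my own rough sketch and are untested against the live site.

def text_or_na(elements, index=0):
    # Return the text of elements[index], or "N/A" if the cssselect
    # result is empty or shorter than expected.
    if len(elements) > index:
        return elements[index].text_content()
    return "N/A"

def attrib_or_na(elements, name, index=0):
    # Same idea for attributes such as href.
    if len(elements) > index:
        return elements[index].attrib.get(name, "N/A")
    return "N/A"

# How I imagine using it inside the loop (hypothetical, not tested):
# record = {}                            # fresh dict per row so values can't carry over
# ids = caseroots.cssselect("div div div div a")
# record['Rules'] = text_or_na(ids, 2)   # "N/A" when fewer than three links are found
# record['Treaty'] = text_or_na(ids, 3)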