2013-07-09 54 views
0

嗨:我是Scraperwiki和Python的新手,並試圖弄清楚如何返回「NA」或類似的東西符合我的cssselect規範的網頁。如何在Python和ScraperWiki中給出空白值返回「N/A」

在我的代碼如下,我颳了一個雙嵌套的網頁集。當我刮掉一個沒有cssselect屬性值的子頁面時,它只會複製最後一個有值的刮頁面的值。

任何提示?謝謝!託德

進口scraperwiki 進口裏urlparse 進口lxml.html 進口的urllib

DEF scrape_table(根): 行= root.cssselect( 「H2」)

record = {} 

for row in rows: 
    table_cells = row.cssselect("h2 a") 
    for cell in table_cells: 
     record['Title'] = table_cells[0].text_content() 
     table_cellsurls = table_cells[0].cssselect("a") 

     record['CaseURL'] = table_cellsurls[0].attrib.get('href') 

     caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read() 
     print caselinkurl 

     caseroots = lxml.html.fromstring(caselinkurl) 

     ids=caseroots.cssselect("div div div div a") 
     #turns out that the data i want is third and second instance. BUT THE PROBLEM I HAVE IS THAT IT COPIES THE PREVIOUS ROW IF NULL. 
     for i in ids: 
      if len(ids)>=2: 
       record['Rules']=ids[2].text_content() 
       record['Treaty']=ids[3].text_content() 
      else: 
       return None 
       #record['Rules']="NA" 
       #record['Treaty']="NA" 
       #pass 
       #print "None" 
# As you can see, i have experimented with different ways of returning nothing. 
     pars = caseroots.cssselect("span.'case-doc-details'") 


     for par in pars: 

      for i in pars:     
       pars1=pars[0].cssselect("a") 
       if len(pars1)>=0: 
        record['DetailsURL']=pars1[0].attrib.get('href') 
       else: 
        return None 

      #Create a third level of scrape. 
       caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars1[0].attrib.get('href')).read() 
       print caselinkurl2 
       caseroots2=lxml.html.fromstring(caselinkurl2) 
       pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'") 
       for i in pars2: 
        if len(pars2)>=0: 
         record['Doc Date']=pars2[0].text_content() 

        else: 

         return None 

       pars3=caseroots2.cssselect("div.'field-item even' span.'file' a") 
       for i in pars3:   
        if len(pars3)>=0:  
         record['Doc Type Link']=pars3[0].attrib.get('href') 
         record['Doc Type']=pars3[0].text_content()  
        else: 
         return None 

       pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'") 
       for i in pars4: 
        if len(pars4)>=0: 

         record['Claimant Nominee']=pars4[0].text_content() 
        else: 
         return None 

       pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'") 
       for i in pars5: 
        if len(pars5)>=0: 

         record['Respondent Nominee']=pars5[0].text_content() 
        else: 

         return None 

       pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'") 
       for i in pars6: 
        if len(pars6)>=0: 

         record['President']=pars6[0].text_content() 
        else: 

         return None 

     print record, '------------' 

     scraperwiki.sqlite.save(["Title"],record) 

DEF scrape_and_look_for_next_link(URL ): html = scraperwiki.scrape(url) print html root = lxml.html.fromstring(html) scrape_table(root)

從這裏開始:

URL = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All' scrape_and_look_for_next_link(URL)

回答

1

這裏回答了我自己的問題。

對於可能變成了一個空值,每個查詢,使用是這樣的:

 for par in pars: 
      pars1=pars[0].cssselect("a") 
      for i in pars1:     
       if len(pars)==0: 
        record['DetailsURL']="None" 
       else: 
        record['DetailsURL']=pars1[0].attrib.get('href')