2015-05-25 92 views
0

我的一位朋友正在開發Scrapy腳本，以便從頁面中抓取數據。過了一段時間，我需要添加另一個字段。我成功添加了該字段。但問題是該字段沒有獲取td內部鏈接的數據。字段名稱是「最後擊球手」。Scrapy的數據爬取卡住了。

數據URL:

http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385

數據的XPath:

//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td

import scrapy 
from bs4 import BeautifulSoup 
from scrapy.exceptions import CloseSpider 
from scrapy.selector import Selector 

from digicricket.items import ODIorTestItem 


class DigicricketMarsilOp1Spider(scrapy.Spider): 
    """Spider for live match scorecards on digicricket.marssil.com (op=1 pages).

    NOTE(review): in this paste the methods below are dedented out of the
    class body — in the real file they presumably sit inside this class.
    """
    name = "digicricket.marssil.op1" 
    allowed_domains = ["digicricket.marssil.com"] 

def __init__(self, match_id=None):
    """Build ``start_urls`` from a comma-separated list of numeric match IDs.

    :param match_id: comma-separated match IDs, e.g. ``"1385"`` or ``"1385,1386"``.
    :raises CloseSpider: if ``match_id`` is missing or any ID is not numeric.
    """
    # NOTE(review): does not call the scrapy.Spider base __init__, same as
    # the original — confirm the spider still initializes correctly.
    if not match_id:
        raise CloseSpider('You forgot input Match ID/IDs')
    match_id_list = match_id.split(',')
    # Validate every ID up front.  The original used a for/else with no
    # break, so the else-clause (building start_urls) always ran — the
    # construct only obscured this straight-line logic.
    for i in match_id_list:
        if not i.isdigit():
            raise CloseSpider('Match ID = {0} is not a number'.format(i))
    self.start_urls = [
        'http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
        for i in match_id_list
    ]

def parse(self, response):
    """Parse one MatchData page into a single ODIorTestItem.

    Fills three list fields on the item:
      * ``Batsman_op1`` — one dict per row of the batsman table (table 2),
      * ``Bowler_op1``  — one dict per row of the bowler table (table 3),
      * ``other_op1``   — one dict of scalar fields (score, extras, result,
        recent overs, last batsman, ...).

    :param response: the Scrapy response for a MatchData.aspx?op=1 page.
    :returns: the populated ODIorTestItem.
    """
    # The match id is the value of the last '=' query parameter in the URL.
    match_id = response.url[response.url.rfind('=') + 1:]
    sel = Selector(response)

    def first_text(xpath):
        # First string matched by `xpath`, or None when nothing matches.
        # Replaces six copy-pasted bare `except:` blocks that swallowed
        # every exception just to default to None on an empty match.
        found = sel.xpath(xpath).extract()
        return found[0] if found else None

    def table_rows(table_html, columns):
        # Yield one dict per <tr> that has <td> cells, mapping `columns`
        # onto the cell texts in order.  zip() tolerates short rows where
        # the original would have raised IndexError.
        serial = 0
        for tr in BeautifulSoup(table_html).find_all('tr'):
            cells = tr.find_all('td')
            if not cells:
                continue
            serial += 1
            row = {'sl': serial, 'match_id': match_id}
            for col_name, cell in zip(columns, cells):
                row[col_name] = cell.get_text()
            yield row

    item = ODIorTestItem()
    item['Batsman_op1'] = []
    item['Bowler_op1'] = []
    item['other_op1'] = []

    tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
    # enumerate() instead of Py2-only xrange(len(tables)); works on 2 and 3.
    for idx, table_html in enumerate(tables):
        if idx == 1:
            item['Batsman_op1'].extend(
                table_rows(table_html, ['Batsman', 'R', 'B', '4s', '6s', 'SR']))
        elif idx == 2:
            item['Bowler_op1'].extend(
                table_rows(table_html, ['Bowler', 'O', 'M', 'R', 'W', 'Econ']))
        # The original's remaining branch looped over rows and discarded the
        # result, then (for idx == 0 only) ran absolute-XPath queries against
        # `sel` — queries that never depended on the loop at all.  They are
        # hoisted below and run unconditionally.

    base = '//*[@id="ctl00_ContentPlaceHolder1_divData"]'
    row_for_other = {
        'match_id': match_id,
        'InningsMatchDetails': first_text(base + '/table[1]/tr/td/b/text()[1]'),
        'CurrentScore': first_text(base + '/table[1]/tr/td/b/span/text()'),
        'OversRunRate': first_text(base + '/table[1]/tr/td/b/text()[2]'),
        'Extras': first_text(base + '/table[1]/tr/td/b/text()[3]'),
        'MatchResult': first_text(base + '/table[1]/tr/td/b/text()[4]'),
        'RecentOvers': first_text(base + '/table[4]/tr/td[2]/text()'),
        # BUG FIX (the question's issue): '/table[6]/tr/td/text()' missed
        # intermediate <div> levels and the <a> holding the batsman's name.
        # Use descendant steps ('//') and read the first <a>'s text.
        'LastBatsman': first_text(base + '//table[6]//tr/td/a[1]/text()'),
    }
    item['other_op1'].append(row_for_other)
    return item

回答

0

您的XPath似乎錯過了一些標記。在網頁上有第二個table之前的兩個div級別。用//代替/負責這些。 (因爲我的瀏覽器添加了一些<tbody>標籤也有在tr前面的雙斜線。

.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()