python • web-scraping • 2015-06-30
    import requests
    from bs4 import BeautifulSoup
    import csv
    from urlparse import urljoin   # (unused in this script)
    import urllib2                 # (unused in this script)

    outfile = open("./battingall.csv", "wb")
    writer = csv.writer(outfile)

    base_url = 'http://www.baseball-reference.com'
    player_url = 'http://www.baseball-reference.com/players/'
    alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
    players = 'shtml'
    gamel = '&t=b&year='
    game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
    years = ['2015','2014','2013','2012','2011','2010','2009','2008']

    # One player-index page per letter of the alphabet
    drounders = []
    for dround in alphabet:
        drounders.append(player_url + dround)

    # Collect every link found on each index page
    urlz = []
    for ab in drounders:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content)
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])

    # Keep only the player pages and build a game-log URL for each year
    yent = []
    for ant in urlz:
        for d in drounders:
            for y in years:
                if players in ant:
                    if len(ant) < 60:
                        if d in ant:
                            yent.append(game_logs + ant[44:-6] + gamel + y)

    # Scrape each game-log page and write its rows to the CSV
    for j in yent:
        try:
            data = requests.get(j)
            soup = BeautifulSoup(data.content)
            table = soup.find('table', attrs={'id': 'batting_gamelogs'})
            tablea = j[52:59]                 # player id sliced from the URL
            tableb = soup.find("b", text='Throws:').next_sibling.strip()
            tablec = soup.find("b", text='Height:').next_sibling.strip()
            tabled = soup.find("b", text='Weight:').next_sibling.strip()
            list_of_rows = []
            for row in table.findAll('tr'):
                list_of_cells = []
                list_of_cells.append(tablea)
                list_of_cells.append(j[len(j)-4:])   # year taken from the URL
                list_of_cells.append(tableb)
                list_of_cells.append(tablec)
                list_of_cells.append(tabled)
                for cell in row.findAll('td'):
                    text = cell.text.replace('&nbsp;', '').encode("utf-8")
                    list_of_cells.append(text)
                list_of_rows.append(list_of_cells)
            print list_of_rows
            writer.writerows(list_of_rows)
        except (AttributeError, NameError):
            pass

When I run the code above to pull the game-log batting data, I keep getting an error while scraping with Python, and I need a way around it:

    Traceback (most recent call last):
      File "battinggamelogs.py", line 44, in <module>
        data = requests.get(j)
      File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/api.py", line 65, in get
        return request('get', url, **kwargs)
      File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/api.py", line 49, in request
        response = session.request(method=method, url=url, **kwargs)
      File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
        resp = self.send(prep, **send_kwargs)
      File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
        r = adapter.send(request, **kwargs)
      File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
        raise ConnectionError(err, request=request)
    requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
    

I need a way to get past this error and keep going. I think the error happens because there is no table to pull data from.


It looks like the request just timed out. Try navigating to the exact URL in a browser and see what happens. – That1Guy

Answer


You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being raised:

    for ab in drounders:
        try:
            data = requests.get(ab)
            soup = BeautifulSoup(data.content)
            for link in soup.find_all('a'):
                if link.has_attr('href'):
                    urlz.append(base_url + link['href'])
        except requests.exceptions.ConnectionError:
            pass
    

This is happening because there is a problem with the connection itself, not because there is no data in the table. You are not even getting that far.

Note: this swallows the exception entirely by simply using pass (as you also do later in your code block). It might be better to do something like this:

    except requests.exceptions.ConnectionError: 
        print("Failed to open {}".format(ab)) 
    

This gives you a message on the console telling you which URL failed.
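If you would rather retry a flaky URL a couple of times before giving up, the same idea can be extended with a small helper. This is only a sketch: fetch_with_retry, the attempt count, and the delay are illustrative names and values, not part of the answer above.

    import time

    def fetch_with_retry(url, attempts=3, delay=2):
        # Hypothetical helper: try the request a few times, sleeping between
        # attempts, and return None if every attempt hits a connection error.
        for attempt in range(attempts):
            try:
                return requests.get(url)
            except requests.exceptions.ConnectionError:
                print("Attempt {} failed for {}".format(attempt + 1, url))
                time.sleep(delay)
        return None

    for j in yent:
        data = fetch_with_retry(j)
        if data is None:
            continue  # skip this URL after repeated failures
        soup = BeautifulSoup(data.content)
        # ... parse the batting_gamelogs table as before ...

Either way, the loop keeps moving instead of dying on the first bad connection.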
