2015-05-30 19 views
1
import requests 
from bs4 import BeautifulSoup 
import csv 
from urlparse import urljoin 
import urllib2 


# Scrape the play-by-play table from every boxscore page linked on the
# 2014 Orioles schedule page and append all table cells to one CSV file.
base_url = 'http://www.baseball-reference.com'
data = requests.get("http://www.baseball-reference.com/teams/BAL/2014-schedule-scores.shtml")
soup = BeautifulSoup(data.content)

# Collect the URL of every anchor whose visible text is exactly 'boxscore'.
# urljoin (imported at the top of the file) resolves relative hrefs against
# base_url and leaves already-absolute hrefs intact, unlike plain '+'.
url = [
    urljoin(base_url, link['href'])
    for link in soup.find_all('a')
    if link.has_attr('href') and link.get_text() == 'boxscore'
]

# Python 2's csv module works on byte strings and a binary-mode file; the
# 'with' block guarantees the file is flushed and closed even on error.
with open("./Balpbp.csv", "wb") as outfile:
    writer = csv.writer(outfile)

    for boxscore_url in url:
        response = requests.get(boxscore_url)
        html = response.content
        soup = BeautifulSoup(html)

        table = soup.find('table', attrs={'id': 'play_by_play'})
        if table is None:
            # find() returns None when the page has no such table; skip the
            # page instead of crashing on None.findAll.
            continue

        list_of_rows = []
        for row in table.findAll('tr'):
            # cell.text is unicode and may contain non-ASCII characters such
            # as u'\xa0'; the Python 2 csv writer would try to encode them as
            # ASCII and raise UnicodeEncodeError, so encode to UTF-8 first.
            # NOTE(review): the replace() argument is kept as found; if the
            # intent is to strip non-breaking spaces, replace u'\xa0' instead
            # -- confirm.
            list_of_cells = [
                cell.text.replace(' ', '').encode("utf-8")
                for cell in row.findAll('td')
            ]
            list_of_rows.append(list_of_cells)
        writer.writerows(list_of_rows)

在Python中由於ASCII錯誤(例如 u'G\xa0Holland'、u'N\xa0Cruz.')導致數據寫入CSV時出錯

以下是錯誤消息:

Traceback (most recent call last): 
    File "try.py", line 40, in <module> 
    writer.writerows(list_of_rows) 
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 57: ordinal not in range(128) 

當我將數據寫入csv時,抓取到的數據中含有 \xa0 這類 \x… 字符,導致數據無法寫入csv。我怎樣才能修改數據來去掉這些字符,或者用其他辦法規避這個問題?

回答

3

Python 2的csv模塊不能直接處理unicode,你需要先對字符串進行encode:

注意

這個版本的csv模塊不支持Unicode輸入。此外,目前在處理ASCII NUL字符方面還存在一些問題。因此,為了安全起見,所有輸入都應該是UTF-8或可打印的ASCII;請參閱示例部分中的示例。

text = cell.text.replace('&nbsp;', '').encode("utf-8") 

編碼之後的輸出:

"Top of the 1st, Red Sox Batting, Tied 0-0, Orioles' Chris Tillman facing 1-2-3 
" 
t1,0-0,0,---,"7,(2-2) CBBFFFX",O,BOS,D. Nava,C. Tillman,2%,52%,Groundout: P-1B (P's Right) 
t1,0-0,1,---,"4,(1-2) BCFX",,BOS,D. Pedroia,C. Tillman,-2%,50%,Single to RF (Line Drive to Short RF) 
t1,0-0,1,1--,"5,(1-2) CFBFT",O,BOS,D. Ortiz,C. Tillman,3%,52%,Strikeout Swinging 
t1,0-0,2,1--,"4,(0-2) C1CFS",O,BOS,M. Napoli,C. Tillman,2%,55%,Strikeout Swinging 
,,,,,,,,,"0 runs, 1 hit, 0 errors, 1 LOB. Red Sox 0, Orioles 0." 
"Bottom of the 1st, Orioles Batting, Tied 0-0, Red Sox' Jon Lester facing 1-2-3 
" 
b1,0-0,0,---,"4,(1-2) CBFX",O,BAL,N. Markakis,J. Lester,-2%,52%,Groundout: 3B-1B (Weak 3B) 
b1,0-0,1,---,"6,(3-2) BBFFBX",,BAL,J. Hardy,J. Lester,2%,55%,Single to LF (Line Drive) 
b1,0-0,1,1--,"4,(1-2) FBSX",O,BAL,A. Jones,J. Lester,-3%,52%,Popfly: SS (Deep SS) 
b1,0-0,2,1--,"5,(1-2) FFBFS",O,BAL,C. Davis,J. Lester,-2%,50%,Strikeout Swinging 
....................................