
So I created a web scraper that goes to cfbstats.com/2014/player/index.html and retrieves all the college football teams and their links. From there it goes into each team link and takes the roster and the player links. Finally, it goes into each player's link and takes his statistics. I am using Beautiful Soup to strip the HTML caption tags and th tags and retrieve the data without the list formatting.

I am currently having a problem with the player stats. When I call each table's caption I get output printed as [Tackle], and when I call the first row of the table I get [G]. I want to get rid of these brackets and tags, but I have been unable to get past them. Any help would be appreciated.
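As far as I can tell, find_all() hands back a ResultSet, which prints like a list, brackets and all. A minimal standalone illustration of the behaviour I am seeing (hypothetical markup, not the real cfbstats page):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<table><caption>Tackle</caption></table>')
caption = soup.find_all('caption')   # a ResultSet -- behaves like a list of tags
print caption                        # -> [<caption>Tackle</caption>], with the brackets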

import csv 
import sys 
import json 
import urllib 
import requests 
from bs4 import BeautifulSoup 
import xlrd 
import xlwt


def getCollegeandURL():
    # f is a csv of every team and the link to its page
    f = open('colleges.csv', 'w')
    f.write("Teams" + "," + "," + "URL" + '\n')
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    base = requests.get(originalurl)
    base = base.text
    soup = BeautifulSoup(base)

    # find all the colleges inside the conference divs
    mydivs = soup.find_all('div', {'class': 'conference'})

    # g is a csv for the rosters
    g = open('rosters.csv', 'w')
    g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' + 'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + 'Weight' + ',' + 'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')

    # h is an xlwt workbook for each player's stats
    h = xlwt.Workbook()

    # write each college and its link to a line, then scrape its roster
    for div in mydivs:
        urls = div.findAll('a')

        for url in urls:
            college = url.text
            url = url.attrs['href']
            # the hrefs are relative, so prepend the scheme and host
            # (the first 23 characters of originalurl)
            teamurl = originalurl[:23] + url
            f.write(college + ',' + ',' + teamurl + '\n')
            scrapeRosters(college, teamurl, g, h)

############################################################################ 
def scrapeRosters(college, teamurl, g, h):
    # fetch and parse the team's roster page
    roster = requests.get(teamurl)
    roster = roster.text
    roster = BeautifulSoup(roster)

    teamname = roster.find_all('h1', {'id': 'pageTitle'})
    table = roster.find_all('table', {'class': 'team-roster'})

    for t in table:
        rows = t.find_all('tr')

        # skip the header row
        for row in rows[1:]:
            data = [str(cell.getText()) for cell in row('td')]
            link = row('td')[1]('a')

            if len(link) > 0:
                link = str(link[0]['href'])
                data = [link] + data

                # unpack the row into variables
                (playerurl, playernumber, playerName, playerPosition,
                 YearinCollege, playerHeight, playerWeight,
                 playerHometown, lastSchool) = data

                # the player href is relative, so build the full url
                playerurl = teamurl[:23] + playerurl

                # repack the data
                data = (college, playernumber, playerName, playerPosition,
                        YearinCollege, playerHeight, playerWeight,
                        playerHometown, lastSchool)

                g.write(college + ',' + playernumber + ',' + playerName + ',' + playerPosition + ',' + YearinCollege + ',' + playerHeight + ',' + playerWeight + ',' + playerHometown + ',' + lastSchool + ',' + ',' + playerurl + ',' + '\n')

                playerStats(data, playerurl, h)

############################################################################ 
def playerStats(data, playerurl, h):
    # fetch and parse the player's page
    page = requests.get(playerurl)
    page = page.text
    page = BeautifulSoup(page)

    tablestats = page.find_all('table', {'class': 'player-home'})

    (college, playernumber, playerName, playerPosition, YearinCollege,
     playerHeight, playerWeight, playerHometown, lastSchool) = data

    print college, playerName, playernumber

    for x in tablestats:
        caption = x.find_all('caption')
        rows = x.find_all('tr')

        ## caption = caption.strip  # fails: caption is a ResultSet, not a string

        for row in rows:
            headers = [str(th.getText()) for th in row('th')]
            stats = [str(td.getText()) for td in row('td')]

            print caption, headers, stats

############################################################################ 
def main():
    getCollegeandURL()

main()

Answer


Don't work so hard; your data is already available in parseable form.
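If the point is the scraping exercise itself, though: the bracketed [Tackle] and [G] are just Python printing lists. find_all() returns a ResultSet, and the string form of a list keeps the brackets; calling get_text() on each tag and joining the results gives plain strings. A minimal sketch of the playerStats loop along those lines, assuming the same player-home table markup as in the question:

for table in tablestats:
    # find() returns a single Tag (or None), so there is no list to print
    caption = table.find('caption')
    caption = caption.get_text().strip() if caption else ''

    for row in table.find_all('tr'):
        headers = [th.get_text().strip() for th in row.find_all('th')]
        cells = [td.get_text().strip() for td in row.find_all('td')]
        # joining the lists drops the brackets and quotes from the printout
        print caption, ' | '.join(headers), ' | '.join(cells)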


Well, I know. I am trying to build the web scraper myself. –


Again, why? The API is designed so that the webpage can change and your code still won't break. – hd1