
OK, I wasn't clear before. What I want to do is get the list of college teams and their URLs from http://www.cfbstats.com/2014/player/index.html and export them to a CSV. I have done that successfully. From there I go into each team and grab every player and their link. If a player has no link, I still want their data written to the CSV. Right now I only get the players that have a URL, not the ones without one. Eventually I want to go into each player's page, grab all of their stats, and write those to a CSV as well. In short: retrieving data and links with Beautiful Soup, and retrieving the data even when a link does not exist.

Sorry for all the confusion in the original post.

import csv 
import sys 
import json 
import urllib 
import requests 
from bs4 import BeautifulSoup 




def getCollegeandURL():

    f = open('colleges.csv', 'w')
    f.write("Teams" + "," + "," + "URL" + '\n')

    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    base = requests.get("http://www.cfbstats.com/2014/player/index.html")
    base = base.text
    soup = BeautifulSoup(base)

    # this finds all the colleges inside the conference divs
    mydivs = soup.find_all('div', {'class': 'conference'})

    # g is the roster csv
    g = open('rosters.csv', 'w')
    g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' + 'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' + 'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')

    # this loop writes each college to a line
    for div in mydivs:
        urls = div.findAll('a')

        # this pulls each college name and its link
        for url in urls:
            college = url.text
            url = url.attrs['href']
            teamurl = originalurl[:23] + url

            f.write(college[:] + ',' + ',' + teamurl[:] + '\n')

            scrapeRosters(college, teamurl, g)








def scrapeRosters(college, teamurl, g):
    # g is the csv file being written to
    # college is the college name
    # teamurl is the url link to that team's roster

    roster = requests.get(teamurl)
    roster = roster.text
    roster = BeautifulSoup(roster)

    teamname = roster.find_all('h1', {'id': 'pageTitle'})

    teamAndPlayers = {}
    table = roster.find_all('table', {'class': 'team-roster'})

    for i in table:
        rows = i.find_all('tr')

        for row in rows[1:]:

            # this retrieves the player url
            for item in row.findAll('a'):

                if item not in row.findAll('a'):
                    row = row.text
                    row = row.split('\n')
                    row = str(row)

                    g.write(college + ',' + row + ',' + ',' + '\n')

                elif (item['href'].startswith('/')):
                    playerurl = item.attrs['href']

                    row = row.text
                    row = row.split('\n')
                    row = str(row)

                    g.write(college + ',' + row + ',' + ',' + playerurl + ',' + '\n')

def main(): 
    getCollegeandURL() 



main()  

I believe the error is in my if and elif statements.

Answer

import urllib, bs4

data = urllib.urlopen('http://www.cfbstats.com/2014/team/140/roster.html')
soup = bs4.BeautifulSoup(data.read())  # creates a BS4 HTML parsing object

for row in soup('tr')[1:]:  # skip the table's header row
    data = [str(i.getText()) for i in row('td')]  # text of every cell in the row
    link = row('td')[1]('a')  # the <a> tags in the name cell -- the linked player

    if len(link) > 0:  # only players with a profile page have an <a> tag
        link = str(link[0]['href'])
        data = [str(link)] + data  # prepend the link to the row's data

    print data
    print '\n'
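
If you want to export these rows to a CSV instead of printing them, a minimal sketch along these lines should work (this assumes Python 2 like the snippet above, uses the standard csv module, and keeps the single hard-coded team URL purely for illustration):

import csv, urllib, bs4

out = open('rosters.csv', 'wb')            # binary mode for Python 2's csv module
writer = csv.writer(out)

data = urllib.urlopen('http://www.cfbstats.com/2014/team/140/roster.html')
soup = bs4.BeautifulSoup(data.read())

for row in soup('tr')[1:]:                 # skip the roster table's header row
    cells = [i.getText().encode('utf-8') for i in row('td')]
    links = row('td')[1]('a')              # empty list when the player has no page

    if len(links) > 0:
        cells = [str(links[0]['href'])] + cells   # prepend the player's link
    else:
        cells = [''] + cells                      # blank link column, data still kept

    writer.writerow(cells)

out.close()

The else branch writes a blank link column, so players without a profile page still end up in the file; the same loop could be run once per team URL collected in getCollegeandURL().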

This still only prints the players with a link, not the ones without... but it does give me a Python string instead of unicode!


So you want all of the players without links?


Because this gets all of them, link or no link.