使用 Python 和 BeautifulSoup 4 抓取網頁 - 初學者

感謝您對上一個問題的幫助（here）。不過，我目前仍然卡在準備最終數據框這一步。我已經能夠從原始表格中提取所有數據，並整理成我想要的形式，但現在我想把主隊和客隊也加入 df 中，卻一直想不出辦法。以下是我目前的代碼，我想抓取的網站是 here is the site。

from urllib.request import urlopen # import the library 
from bs4 import BeautifulSoup # Import BS 
from bs4 import SoupStrainer # Import Soup Strainer 
import pandas as pd # import pandas as a package 

# Build the match-statistics URL from a fixed base and a match id.
basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape+matchid

page = urlopen(scrapeweb1) # open the URL; returns a file-like response object
only_tables = SoupStrainer('table', attrs={"width" : "583"}) # restrict parsing to <table width="583"> elements
soup = BeautifulSoup(page, 'html.parser', parse_only=only_tables) # parse the response with that filter

only_teams = SoupStrainer('table', attrs={"width" : "376"}) # restrict parsing to <table width="376"> elements
# NOTE(review): `page` was already handed to BeautifulSoup above. An HTTP
# response body can normally be read only once, so this second parse
# probably receives no data and soup2 ends up empty - confirm.
soup2 = BeautifulSoup(page, 'html.parser', parse_only=only_teams) # parse the same response object again


# Rows that carry the hover handler; per the author these are the rows with player data.

table = soup.find_all("tr", attrs={"onmouseover" : "this.bgColor='#cbcdd0';"})


# One list per output column; each gets one entry per player row below.

hometeam = []
awayteam = []

player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Visit every <tr> that has the hover handler (the same query as `table` above;
# no row is skipped).
for row in soup.find_all("tr", attrs={"onmouseover" : "this.bgColor='#cbcdd0';"}):
    # All <td> cells of the current row.
    col = row.find_all('td')

    # Stripped text of the 1st cell,
    column_1 = col[0].string.strip()
    # appended to the player list.
    player.append(column_1)

    # Stripped text of the 2nd cell,
    column_2 = col[1].string.strip()
    # appended to the kicks list.
    kicks.append(column_2)

    # Stripped text of the 3rd cell,
    column_3 = col[2].string.strip()
    # appended to the handballs list.
    handballs.append(column_3)

    # Stripped text of the 4th cell,
    column_4 = col[3].string.strip()
    # appended to the disposals list.
    disposals.append(column_4)

    # Stripped text of the 5th cell,
    column_5 = col[4].string.strip()
    # appended to the marks list.
    marks.append(column_5)

    # Stripped text of the 6th cell,
    column_6 = col[5].string.strip()
    # appended to the goals list.
    goals.append(column_6)

    # Stripped text of the 7th cell,
    column_7 = col[6].string.strip()
    # appended to the behinds list.
    behinds.append(column_7)

    # Stripped text of the 8th cell,
    column_8 = col[7].string.strip()
    # appended to the tackles list.
    tackles.append(column_8)

    # Stripped text of the 9th cell,
    column_9 = col[8].string.strip()
    # appended to the hitouts list.
    hitouts.append(column_9)

    # Stripped text of the 10th cell,
    column_10 = col[9].string.strip()
    # appended to the inside50s list.
    inside50s.append(column_10)

    # Stripped text of the 11th cell,
    column_11 = col[10].string.strip()
    # appended to the freesfor list.
    freesfor.append(column_11)

    # Stripped text of the 12th cell,
    column_12 = col[11].string.strip()
    # appended to the freesagainst list.
    freesagainst.append(column_12)

    # Stripped text of the 13th cell,
    column_13 = col[12].string.strip()
    # appended to the fantasy list.
    fantasy.append(column_13)

    # Stripped text of the 14th cell,
    column_14 = col[13].string.strip()
    # appended to the supercoach list.
    supercoach.append(column_14)

# Visit every <tr class="leftbold"> found in soup2.
for row in soup2.find_all("tr", class_= "leftbold"):
    # All <td> cells of the current row.
    col2 = row.find_all('td')

    # Stripped text of the 1st cell. This REBINDS hometeam from the empty
    # list created above to a single string (overwritten on every iteration).
    hometeam = col2[0].string.strip()
    # Disabled attempt to append instead (column2_1 is not defined anywhere):
    # hometeam.append(column2_1)

    # Stripped text of the 2nd cell; rebinds awayteam to a string the same way.
    awayteam = col2[1].string.strip()
    # Disabled attempt to append instead (column2_2 is not defined anywhere):
    # awayteam.append(column2_2)


# Map each output column name to its value: a scalar for match_id, lists for the stats.
# NOTE(review): if the loop above never runs, hometeam/awayteam are still the
# empty lists, whose length differs from the per-player lists - pandas rejects
# list values of unequal length.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam, 'player': player, 'kicks': kicks, 'handballs': handballs, 'disposals': disposals, 'marks': marks, 'goals': goals, 'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts, 'inside_50s': inside50s, 'frees_for': freesfor, 'frees_against': freesagainst, 'fantasy': fantasy, 'supercoach': supercoach}

# Build the dataframe; the explicit `columns` list fixes the column order.
df = pd.DataFrame(columns, columns = ['match_id', 'home_team', 'away_team', 'player', 'kicks', 'handballs', 'disposals', 'marks', 'goals', 'behinds', 'tackles', 'hitouts', 'inside_50s', 'frees_for', 'frees_against', 'fantasy', 'supercoach'])

print(df)

# Debug helper: dump the parsed player tables.
# print(soup.prettify())

# Debug helper: dump the matched player rows.
# print(table)

很顯然，這個數據框無法建立，因爲各個數組的長度並不完全相同。我該如何抓取主隊和客隊的名稱，並把它們各自保存到一個變量中，使其用法與 matchid 相同？

另外，有沒有辦法讓「hometeam」的值出現在前 22 行，而「awayteam」的值出現在第 23-44 行？這樣每位球員就能歸屬到各自的球隊。

我覺得我這一部分做錯了：

# Visit every <tr class="leftbold"> found in soup2.
for row in soup2.find_all("tr", class_= "leftbold"):
    # All <td> cells of the current row.
    col2 = row.find_all('td')

    # Stripped text of the 1st cell. This rebinds hometeam to a single string
    # (overwritten on every iteration) instead of appending to a list.
    hometeam = col2[0].string.strip()
    # Disabled attempt to append instead (column2_1 is not defined in this snippet):
    # hometeam.append(column2_1)

    # Stripped text of the 2nd cell; rebinds awayteam to a string the same way.
    awayteam = col2[1].string.strip()
    # Disabled attempt to append instead (column2_2 is not defined in this snippet):
    # awayteam.append(column2_2)

非常感謝您的幫助。

（還有一個額外的問題：我讀到用「+」拼接字符串不是最佳做法，所以想改用「.join」方法來構造 scrapeweb1，但沒有成功。失敗的代碼如下）

# Fails with TypeError: str.join() takes exactly one argument, an iterable of
# strings. Passing the parts as a single list works: "".join([basescrape, matchid])
scrapeweb1 = "".join(basescrape, matchid)

編輯：我檢查了頁面的源代碼，發現該表格中似乎有一些不正確的 HTML ...

<table border="0" cellspacing="0" cellpadding="0" width="376" id="matchscoretable"> 
<tr> 
<th class="leftbold" height="23" width="100">Team</td>

它用「</td>」而不是「</th>」來結束 <th> 標籤，經 BeautifulSoup 解析時會導致表格標籤被提前關閉...

[<table border="0" cellpadding="0" cellspacing="0" id="matchscoretable" width="376"> 
<tr> 
<th class="leftbold" height="23" width="100">Team</th></tr></table>]

我可能得另找辦法來獲取主隊和客隊的名稱

來源

2017-02-22 MSalty

以下是一種可行的做法：

from urllib.request import urlopen # import the library 
from bs4 import BeautifulSoup # Import BS 
from bs4 import SoupStrainer # Import Soup Strainer 
import pandas as pd # import pandas as a package 

# Build the match-statistics URL from a fixed base and a match id.
basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
url = ''.join([basescrape, matchid])

# Parse only <table width="585"> elements: per the original author, these
# wrap each team's title cell as well as the nested player table.
only_tables = SoupStrainer('table', attrs={"width": "585"})
# Use the response as a context manager so the HTTP connection is closed as
# soon as parsing is finished instead of being left open.
with urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser', parse_only=only_tables)

# use the table titles as anchor points
teams = soup.find_all('td', attrs={'class': 'innertbtitle', 'align': 'left'})
# create an empty list for the players
player_list = []
# iterate through anchor points
for team in teams:
    # extract team name from the table title (text before the first space)
    team_name = team.text.strip().split(' ', maxsplit=1)[0]
    # get the rows from the next <table width="583"> after the anchor point
    trs = team.find_next('table', attrs={'width': '583'}).find_all('tr')
    # create list of labels using first row in table (removed from trs)
    labels = [td.text for td in trs.pop(0).find_all('td')]
    # iterate through the remaining rows
    for row in trs:
        # build dictionary using label as key and text of each cell as value
        player_dict = {
            label: value.text
            for label, value in zip(labels, row.find_all('td'))
        }
        # add team name to dictionary
        player_dict['team'] = team_name
        # append dictionary to the list
        player_list.append(player_dict)

# create the dataframe: one row per player dictionary
df = pd.DataFrame(player_list)
print(df)

來源

2017-02-22 09:37:04 Jay

非常感謝您花時間寫這個答案。按我的理解，這段代碼抓取了所有數據，並把它們放進一個整潔的數據框，對嗎？編輯：我剛抽出時間運行了一下，哇，真是簡潔！非常感謝您的幫助。抱歉這麼麻煩您，但您能幫我理解每一行嗎？我正在努力提高自己的 Python 編程水平。 – MSalty

我目前的理解如下（以「player_list = []」作爲第 1 行）： 1. 創建一個名爲「player_list」的空列表 2. 遍歷「teams」中的每個「team」BS 元素 3. 從表格標題中取出文本，按空格拆分，取第一個值作爲球隊名稱 4. 然後定義一個新的 BS 元素，即我們要抓取的實際表格中的各行 5. 用第一行的標題爲 DF 創建標籤 6. 創建一個循環，遍歷表中其餘的每一行 7. 再用一個循環把每列的值與對應的標題配對，並存入「player_dict」中... – MSalty

8. 取出每個 'td' 標籤中的值，並存到對應的標籤下 9. 在 'player_dict' 中新增一個名爲 'team' 的鍵，並把球隊名稱賦給該行 10. 把完成的行添加到「player_list」變量中 11. 用 player_list 創建數據框。這一切是否正確？我有沒有遺漏什麼？再次感謝。 – MSalty

我設法解決了這個問題，以下是現在已完成的代碼...

from urllib.request import urlopen # import the library 
from bs4 import BeautifulSoup # Import BS 
from bs4 import SoupStrainer # Import Soup Strainer 
import pandas as pd # import pandas as a package 

# Build the match-statistics URL from a fixed base and a match id.
basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# Download the page once and keep the raw bytes: a response object can only be
# read once, but the same bytes can be parsed as many times as needed. This
# replaces the two separate requests for the same URL.
with urlopen(scrapeweb1) as response:
    html = response.read()

# First parse: only <table width="583"> elements (the player-stat tables).
only_tables = SoupStrainer('table', attrs={"width": "583"})
soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)
# Second parse: the whole document, used to locate the team names.
soup2 = BeautifulSoup(html, 'html.parser')

# Rows that carry the hover handler; per the author these are the rows with
# player data.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})

# The second <table width="375"> holds the team names in its first two
# <td width="124"> cells (home first, then away).
Table1 = soup2.find_all('table', attrs={'width': "375"})[1]
team_cells = Table1.find_all('td', attrs={'width': "124"})
hometeam = team_cells[0].string.strip()
awayteam = team_cells[1].string.strip()

# Output column names in the order the cells appear in each player row.
stat_columns = [
    'player', 'kicks', 'handballs', 'disposals', 'marks', 'goals', 'behinds',
    'tackles', 'hitouts', 'inside_50s', 'frees_for', 'frees_against',
    'fantasy', 'supercoach',
]
# One list per stat column; each gets one entry per player row.
stats = {name: [] for name in stat_columns}

for row in table:
    col = row.find_all('td')
    # Index explicitly (rather than zip) so a row with fewer than 14 cells
    # still raises IndexError instead of silently producing ragged columns.
    for index, name in enumerate(stat_columns):
        stats[name].append(col[index].string.strip())

# Scalar values (match id and team names) are broadcast by pandas to every row.
columns = {
    'match_id': matchid,
    'home_team': hometeam,
    'away_team': awayteam,
    **stats,
}

# Build the dataframe, keeping the columns in the insertion order above.
df = pd.DataFrame(columns, columns=list(columns))

print(df)

來源

2017-02-22 09:25:43 MSalty

使用 Python 和 BeautifulSoup 4 抓取網頁 - 初學者

回答

相關問題