2017-04-13 104 views
0
<table cellspacing="0" rules="all" border="1" id="MainContent_grdUsers2" style="border-style:None;width:100%;border-collapse:collapse;"> 
        <tbody><tr class="listHeader"> 
         <th scope="col" style="width:11%;">Name</th><th scope="col" style="width:12%;">Password</th><th scope="col" style="width:16%;">Rights</th><th scope="col" style="width:10%;">Bureaus</th><th scope="col" style="width:15%;">FullName</th><th scope="col" style="width:16%;">Email</th><th scope="col" style="width:12%;">Status</th><th scope="col" style="width:12%;">Logon Tries</th> 
        </tr><tr> 
         <td>user1</td><td align="center"> 
                <input name="ctl00$MainContent$grdUsers2$ctl02$txtManageUsersPassword" type="text" maxlength="50" id="MainContent_grdUsers2_txtManageUsersPassword_0" style="width: 95%; background-image: url(&quot;data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAASCAYAAABSO15qAAAAAXNSR0IArs4c6QAAAUBJREFUOBGVVE2ORUAQLvIS4gwzEysHkHgnkMiEc4zEJXCMNwtWTmDh3UGcYoaFhZUFCzFVnu4wIaiE+vvq6+6qTgthGH6O4/jA7x1OiCAIPwj7CoLgSXDxSjEVzAt9k01CBKdWfsFf/2WNuEwc2YqigKZpK9glAlVVwTTNbQJZlnlCkiTAZnF/mePB2biRdhwHdF2HJEmgaRrwPA+qqoI4jle5/8XkXzrCFoHg+/5ICdpm13UTho7Q9/0WnsfwiL/ouHwHrJgQR8WEwVG+oXpMPaDAkdzvd7AsC8qyhCiKJjiRnCKwbRsMw9hcQ5zv9maSBeu6hjRNYRgGFuKaCNwjkjzPoSiK1d1gDDecQobOBwswzabD/D3Np7AHOIrvNpHmPI+Kc2RZBm3bcp8wuwSIot7QQ0PznoR6wYSK0Xb/AGVLcWwc7Ng3AAAAAElFTkSuQmCC&quot;); background-repeat: no-repeat; background-attachment: scroll; background-size: 16px 18px; background-position: 98% 50%; cursor: auto;" autocomplete="off"> 
               </td><td align="center"> 
                <select name="ctl00$MainContent$grdUsers2$ctl02$ddlManageUsersRights" id="MainContent_grdUsers2_ddlManageUsersRights_0" style="width:95%;"> 
          <option value="User">User</option> 
          <option selected="selected" value="Supervisor">Supervisor</option> 
          <option value="Administrator">Administrator</option> 
          <option value="Child Supervisor">Child Supervisor</option> 

         </select> 

               </td><td align="center"> 
                <select name="ctl00$MainContent$grdUsers2$ctl02$ddlManageUsersBureaus" id="MainContent_grdUsers2_ddlManageUsersBureaus_0" style="width:95%;"> 
          <option value="255">High</option> 
          <option selected="selected" value="128">Medium</option> 
          <option value="0">Low</option> 

         </select> 

               </td><td align="center"> 
                <input name="ctl00$MainContent$grdUsers2$ctl02$txtManageUsersFullName" type="text" value="First1 Last1" maxlength="50" id="MainContent_grdUsers2_txtManageUsersFullName_0" style="width: 95%; background-image: url(&quot;data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAASCAYAAABSO15qAAAAAXNSR0IArs4c6QAAAUBJREFUOBGVVE2ORUAQLvIS4gwzEysHkHgnkMiEc4zEJXCMNwtWTmDh3UGcYoaFhZUFCzFVnu4wIaiE+vvq6+6qTgthGH6O4/jA7x1OiCAIPwj7CoLgSXDxSjEVzAt9k01CBKdWfsFf/2WNuEwc2YqigKZpK9glAlVVwTTNbQJZlnlCkiTAZnF/mePB2biRdhwHdF2HJEmgaRrwPA+qqoI4jle5/8XkXzrCFoHg+/5ICdpm13UTho7Q9/0WnsfwiL/ouHwHrJgQR8WEwVG+oXpMPaDAkdzvd7AsC8qyhCiKJjiRnCKwbRsMw9hcQ5zv9maSBeu6hjRNYRgGFuKaCNwjkjzPoSiK1d1gDDecQobOBwswzabD/D3Np7AHOIrvNpHmPI+Kc2RZBm3bcp8wuwSIot7QQ0PznoR6wYSK0Xb/AGVLcWwc7Ng3AAAAAElFTkSuQmCC&quot;); background-repeat: no-repeat; background-attachment: scroll; background-size: 16px 18px; background-position: 98% 50%; cursor: auto;" autocomplete="off"> 
               </td><td align="center"> 
                <input name="ctl00$MainContent$grdUsers2$ctl02$txtManageUsersEmail" type="text" value="[email protected]" maxlength="50" id="MainContent_grdUsers2_txtManageUsersEmail_0" style="width: 95%; background-image: url(&quot;data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAASCAYAAABSO15qAAAAAXNSR0IArs4c6QAAAUBJREFUOBGVVE2ORUAQLvIS4gwzEysHkHgnkMiEc4zEJXCMNwtWTmDh3UGcYoaFhZUFCzFVnu4wIaiE+vvq6+6qTgthGH6O4/jA7x1OiCAIPwj7CoLgSXDxSjEVzAt9k01CBKdWfsFf/2WNuEwc2YqigKZpK9glAlVVwTTNbQJZlnlCkiTAZnF/mePB2biRdhwHdF2HJEmgaRrwPA+qqoI4jle5/8XkXzrCFoHg+/5ICdpm13UTho7Q9/0WnsfwiL/ouHwHrJgQR8WEwVG+oXpMPaDAkdzvd7AsC8qyhCiKJjiRnCKwbRsMw9hcQ5zv9maSBeu6hjRNYRgGFuKaCNwjkjzPoSiK1d1gDDecQobOBwswzabD/D3Np7AHOIrvNpHmPI+Kc2RZBm3bcp8wuwSIot7QQ0PznoR6wYSK0Xb/AGVLcWwc7Ng3AAAAAElFTkSuQmCC&quot;); background-repeat: no-repeat; background-attachment: scroll; background-size: 16px 18px; background-position: 98% 50%; cursor: auto;" autocomplete="off"> 
               </td><td align="center"> 
                <select name="ctl00$MainContent$grdUsers2$ctl02$ddlManageUsersStatus" id="MainContent_grdUsers2_ddlManageUsersStatus_0" style="width:95%;"> 
          <option value="Active">Active</option> 
          <option selected="selected" value="Inactive">Inactive</option> 
          <option value="Terminated">Terminated</option> 

         </select> 

               </td><td align="center">              
                <input name="ctl00$MainContent$grdUsers2$ctl02$txtManageUsersLogonTries" type="text" value="0" maxlength="1" id="MainContent_grdUsers2_txtManageUsersLogonTries_0" style="width:95%;"> 
               </td> 
        </tr><tr style="background-color:#CED6E7;"> 
         <td>user2</td><td align="center"> 
                <input name="ctl00$MainContent$grdUsers2$ctl03$txtManageUsersPassword" type="text" maxlength="50" id="MainContent_grdUsers2_txtManageUsersPassword_1" style="background-color: rgb(206, 214, 231); width: 95%; background-image: url(&quot;data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAASCAYAAABSO15qAAAAAXNSR0IArs4c6QAAAUBJREFUOBGVVE2ORUAQLvIS4gwzEysHkHgnkMiEc4zEJXCMNwtWTmDh3UGcYoaFhZUFCzFVnu4wIaiE+vvq6+6qTgthGH6O4/jA7x1OiCAIPwj7CoLgSXDxSjEVzAt9k01CBKdWfsFf/2WNuEwc2YqigKZpK9glAlVVwTTNbQJZlnlCkiTAZnF/mePB2biRdhwHdF2HJEmgaRrwPA+qqoI4jle5/8XkXzrCFoHg+/5ICdpm13UTho7Q9/0WnsfwiL/ouHwHrJgQR8WEwVG+oXpMPaDAkdzvd7AsC8qyhCiKJjiRnCKwbRsMw9hcQ5zv9maSBeu6hjRNYRgGFuKaCNwjkjzPoSiK1d1gDDecQobOBwswzabD/D3Np7AHOIrvNpHmPI+Kc2RZBm3bcp8wuwSIot7QQ0PznoR6wYSK0Xb/AGVLcWwc7Ng3AAAAAElFTkSuQmCC&quot;); background-repeat: no-repeat; background-attachment: scroll; background-size: 16px 18px; background-position: 98% 50%;" autocomplete="off"> 
               </td><td align="center"> 
                <select name="ctl00$MainContent$grdUsers2$ctl03$ddlManageUsersRights" id="MainContent_grdUsers2_ddlManageUsersRights_1" style="background-color:#CED6E7;width:95%;"> 
          <option value="User">User</option> 
          <option selected="selected" value="Supervisor">Supervisor</option> 
          <option value="Administrator">Administrator</option> 
          <option value="Child Supervisor">Child Supervisor</option> 

         </select> 

               </td><td align="center"> 
                <select name="ctl00$MainContent$grdUsers2$ctl03$ddlManageUsersBureaus" id="MainContent_grdUsers2_ddlManageUsersBureaus_1" style="background-color:#CED6E7;width:95%;"> 
          <option value="255">High</option> 
          <option selected="selected" value="128">Medium</option> 
          <option value="0">Low</option> 

         </select> 

               </td><td align="center"> 
                <input name="ctl00$MainContent$grdUsers2$ctl03$txtManageUsersFullName" type="text" value="First2 Last2" maxlength="50" id="MainContent_grdUsers2_txtManageUsersFullName_1" style="background-color: rgb(206, 214, 231); width: 95%; background-image: url(&quot;data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAASCAYAAABSO15qAAAAAXNSR0IArs4c6QAAAUBJREFUOBGVVE2ORUAQLvIS4gwzEysHkHgnkMiEc4zEJXCMNwtWTmDh3UGcYoaFhZUFCzFVnu4wIaiE+vvq6+6qTgthGH6O4/jA7x1OiCAIPwj7CoLgSXDxSjEVzAt9k01CBKdWfsFf/2WNuEwc2YqigKZpK9glAlVVwTTNbQJZlnlCkiTAZnF/mePB2biRdhwHdF2HJEmgaRrwPA+qqoI4jle5/8XkXzrCFoHg+/5ICdpm13UTho7Q9/0WnsfwiL/ouHwHrJgQR8WEwVG+oXpMPaDAkdzvd7AsC8qyhCiKJjiRnCKwbRsMw9hcQ5zv9maSBeu6hjRNYRgGFuKaCNwjkjzPoSiK1d1gDDecQobOBwswzabD/D3Np7AHOIrvNpHmPI+Kc2RZBm3bcp8wuwSIot7QQ0PznoR6wYSK0Xb/AGVLcWwc7Ng3AAAAAElFTkSuQmCC&quot;); background-repeat: no-repeat; background-attachment: scroll; background-size: 16px 18px; background-position: 98% 50%; cursor: auto;" autocomplete="off"> 
               </td><td align="center"> 
                <input name="ctl00$MainContent$grdUsers2$ctl03$txtManageUsersEmail" type="text" value="[email protected]" maxlength="50" id="MainContent_grdUsers2_txtManageUsersEmail_1" style="background-color: rgb(206, 214, 231); width: 95%; background-image: url(&quot;data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAASCAYAAABSO15qAAAAAXNSR0IArs4c6QAAAUBJREFUOBGVVE2ORUAQLvIS4gwzEysHkHgnkMiEc4zEJXCMNwtWTmDh3UGcYoaFhZUFCzFVnu4wIaiE+vvq6+6qTgthGH6O4/jA7x1OiCAIPwj7CoLgSXDxSjEVzAt9k01CBKdWfsFf/2WNuEwc2YqigKZpK9glAlVVwTTNbQJZlnlCkiTAZnF/mePB2biRdhwHdF2HJEmgaRrwPA+qqoI4jle5/8XkXzrCFoHg+/5ICdpm13UTho7Q9/0WnsfwiL/ouHwHrJgQR8WEwVG+oXpMPaDAkdzvd7AsC8qyhCiKJjiRnCKwbRsMw9hcQ5zv9maSBeu6hjRNYRgGFuKaCNwjkjzPoSiK1d1gDDecQobOBwswzabD/D3Np7AHOIrvNpHmPI+Kc2RZBm3bcp8wuwSIot7QQ0PznoR6wYSK0Xb/AGVLcWwc7Ng3AAAAAElFTkSuQmCC&quot;); background-repeat: no-repeat; background-attachment: scroll; background-size: 16px 18px; background-position: 98% 50%; cursor: auto;" autocomplete="off"> 
               </td><td align="center"> 
                <select name="ctl00$MainContent$grdUsers2$ctl03$ddlManageUsersStatus" id="MainContent_grdUsers2_ddlManageUsersStatus_1" style="background-color:#CED6E7;width:95%;"> 
          <option selected="selected" value="Active">Active</option> 
          <option value="Inactive">Inactive</option> 
          <option value="Terminated">Terminated</option> 

         </select> 

               </td><td align="center">              
                <input name="ctl00$MainContent$grdUsers2$ctl03$txtManageUsersLogonTries" type="text" value="0" maxlength="1" id="MainContent_grdUsers2_txtManageUsersLogonTries_1" style="background-color:#CED6E7;width:95%;"> 
</td> 
</tr> 
</tbody> 
</table> 

使用 BeautifulSoup 和 Python 抓取複雜表格

我想抓取一個同時包含文字、下拉選單選項和輸入框值的表格。期望的結果如下:

user1 | Supervisor | Medium | First1 Last1 | [email protected] | Inactive

user2 | Supervisor | Medium | First2 Last2 | [email protected] | Active

我打算把結果輸出成 CSV。到目前爲止,我寫了:

# First attempt from the question: read the header captions, then try to pull
# single values out of the users table. `soup` must already exist.

# Stripped text of every <th> inside the row whose class is "listHeader".
headers = [c.get_text(strip=True) for c in soup.find('tr', attrs={'class':'listHeader'}).findAll('th')] 

# Asker's note: find_all doesn't work here, it just grabs one.
# NOTE(review): find() returns a single element, so this loop walks whatever
# that one element yields when iterated -- presumably its direct child nodes
# rather than the table rows (confirm against the bs4 docs).
# `column3` is reassigned on every pass, so only the last successful lookup
# survives, and it holds the option's `value` attribute, not its visible text.
for table in soup.find('table', attrs={'id':'MainContent_grdUsers2'}): 
     try: 
      column3=(table.find("option", attrs={"selected": "selected"}).get('value')) 
     # Bare except: any failure in the lookup is skipped without a message.
     except: 
      continue 

# Asker's note: this only grabs a specific cell.
# The lookup is pinned to the id ending in "_0", i.e. the email input of the
# first data row only; `column6` is likewise overwritten on each pass.
for table in soup.find('table', attrs={'id':'MainContent_grdUsers2'}): 
     try: 
      column6=(table.find("input", attrs={"id": "MainContent_grdUsers2_txtManageUsersEmail_0"}).get('value')) 
     # Bare except: any failure in the lookup is skipped without a message.
     except: 
      continue 

我可以逐一抓取想要的儲存格,但這個表格大約有 100 列記錄。我不知道該怎樣一次把它們全部抓下來,因爲裏面不只有文字,還有下拉選單的選中項和輸入框的值。有沒有辦法用 BeautifulSoup 做到這一點?我也試過 pandas 和 lxml,但以前從未用過它們。

更新代碼:

# Updated attempt: build one list per user row and print the rows.
# `soup` must already be a parsed BeautifulSoup document.

# Stripped text of every <th> inside the row whose class is "listHeader".
headers = [c.get_text(strip=True) for c in soup.find('tr', attrs={'class':'listHeader'}).findAll('th')]
table = soup.find('table', attrs={'id':'MainContent_grdUsers2'})
data = []

# The first <tr> is the header row, so start from the second one.
for tr in table.find_all('tr')[1:]:
    td = tr.find_all('td')
    try:
        # Cell layout: 0 = name, 1 = password (ignored), 2 = rights,
        # 3 = bureaus, 4 = full name, 5 = email, 6 = status,
        # 7 = logon tries (ignored).
        full_name = td[4].find('input').get('value')
        # A statement cannot live inside a list literal, so the None check
        # happens here, before the row is assembled: rows whose full-name
        # input carries no value are skipped.
        if full_name is None:
            continue
        data.append([
            td[0].getText(),
            td[2].find('option', {'selected':'selected'}).getText(),
            td[3].find('option', {'selected':'selected'}).getText(),
            full_name,
            td[5].find('input').get('value'),
            td[6].find('option', {'selected':'selected'}).getText(),
        ])
    except (AttributeError, IndexError) as ex:
        # AttributeError: find() returned None (no selected option / no input).
        # IndexError: the row has fewer cells than expected.
        #print(ex) ## you can uncomment this line for debugging ##
        continue

for row in data:
    # str() keeps join() from raising when a value (e.g. the email) is None.
    print(' '.join(str(r) for r in row))

回答

1

根據你提供的 HTML,下面的程式碼應該可行:

def _selected_text(cell):
    """Return the text of the chosen <option> inside ``cell``, or '' if none.

    The option carrying a ``selected`` attribute (with any value) wins. When
    no option is marked, the first option is used, since that is what a
    single-choice <select> displays by default.
    """
    option = cell.find('option', attrs={'selected': True})
    if option is None:
        option = cell.find('option')
    return '' if option is None else option.get_text(strip=True)


def _input_value(cell):
    """Return the ``value`` attribute of the first <input> in ``cell``, or ''."""
    field = cell.find('input')
    if field is None:
        return ''
    return field.get('value', '')


# Header captions, or None when the page has no "listHeader" row.
# `soup` must already be a parsed BeautifulSoup document.
header_row = soup.find('tr', attrs={'class': 'listHeader'})
if header_row is not None:
    headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
else:
    headers = None

table = soup.find('table', attrs={'id': 'MainContent_grdUsers2'})
data = []

# Skip the first <tr> (the header row); a missing table yields no rows
# instead of an AttributeError.
rows = table.find_all('tr')[1:] if table is not None else []
for tr in rows:
    td = tr.find_all('td')
    # Cell layout: 0 = name, 1 = password (ignored), 2 = rights, 3 = bureaus,
    # 4 = full name, 5 = email, 6 = status, 7 = logon tries (ignored).
    if len(td) < 7:
        continue
    # A missing option or input becomes '' so one incomplete cell no longer
    # makes the whole row disappear.
    data.append([
        td[0].get_text(strip=True),
        _selected_text(td[2]),
        _selected_text(td[3]),
        _input_value(td[4]),
        _input_value(td[5]),
        _selected_text(td[6]),
    ])

for row in data:
    print(' '.join(row))

輸出:

user1 Supervisor Medium First1 Last1 [email protected] Inactive 
user2 Supervisor Medium First2 Last2 [email protected] Active 
+0

這很合理,但我遇到了一個 NoneType 錯誤:td[2].find('option', {'selected':'selected'}).getText(), AttributeError: 'NoneType' object has no attribute 'getText' – nvachhan

+0

你可以給我網址或完整的 HTML 表格嗎? –

+0

表格太大,放不進問題裏;有沒有其他方式可以向你說明? – nvachhan