2016-01-25 64 views
1

標題:抓取表格底部的行

我正在使用Python 3.4。我知道如何利用BeautifulSoup來抓取網頁,但我正在嘗試找出完成此操作的最有效方法。Nexus factory image page(Android)包含所有Nexus設備的列表,並在新版本可用時更新。最新的版本總是添加到相應表格的底部。我已經列出了每個設備的名稱,包括正式名稱和代號,並且我只提取這些設備的資料(設備列表本身最多每年更新一次,而且只有部分設備仍然會收到更新)。

拉出每個表格最底部條目的最有效方式是什麼?我打算將底部行中第一個<td>的字符串保存爲pickle對象,以便稍後可以輕鬆地比較字符串,檢查當前最下面一行是否是新的,但我不確定提取該條目本身的最佳方法是什麼。

每個<tr>都有一個格式爲「設備名稱+版本號」(devname + buildnumber)的ID。由於我有每個設備的名稱,並且會保存最新的版本字符串,我應該能夠通過soup.find("tr", id=dev + buildstring)來搜索。然而,這會連同所找到的行的每一個兄弟節點和子節點一起返回,所以我不知道如何最好地利用它。

回答

2

下面的代碼可以讓你入門。思路是取得所有帶有id屬性的h2元素——除了第一個以外,它們都是設備名稱元素。對於找到的每個元素,我們獲取其後的下一個table元素,並將各個版本解析爲列表。實現如下:

from pprint import pprint 

import requests 
from bs4 import BeautifulSoup 


# Fetch the Nexus factory-images page and build a dict mapping each device
# heading to the list of first-cell texts of its release rows.
url = "https://developers.google.com/android/nexus/images"
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into {}.
response.raise_for_status()

soup = BeautifulSoup(response.content, "lxml")

data = {}
# The first <h2 id=...> is skipped; per the answer text the remaining ones
# are the device-name headings.
for device in soup.find_all("h2", id=True)[1:]:
    table = device.find_next("table")
    if table is None:
        # Heading with no table after it: nothing to record.
        continue

    device_name = device.get_text(strip=True)

    versions = []
    # Only rows carrying an id attribute are considered release rows.
    for row in table.find_all("tr", id=True):
        cell = row.find("td")
        if cell is not None:
            versions.append(cell.get_text(strip=True))
    data[device_name] = versions

pprint(data)

這會打印出一個以設備名稱爲鍵、以版本列表爲值的字典:

{'"angler" for Nexus 6P': ['6.0.0 (MDA89D)', 
          '6.0.0 (MDB08K)', 
          '6.0.0 (MDB08L)', 
          '6.0.0 (MDB08M)', 
          '6.0.0 (MMB29N)', 
          '6.0.1 (MMB29M)', 
          '6.0.1 (MMB29P)'], 
'"bullhead" for Nexus 5X': ['6.0.0 (MDA89E)', 
          '6.0.0 (MDB08I)', 
          '6.0.0 (MDB08L)', 
          '6.0.0 (MDB08M)', 
          '6.0.1 (MMB29K)', 
          '6.0.1 (MMB29P)'], 
'"fugu" for Nexus Player': ['5.0 (LRX21M)', 
          '5.0 (LRX21V)', 
          '5.1.0 (LMY47D)', 
          '5.1.1 (LMY47V)', 
          '5.1.1 (LMY48J)', 
          '5.1.1 (LMY48N)', 
          '6.0.0 (MRA58K)', 
          '6.0.0 (MRA58N)', 
          '6.0.1 (MMB29M)', 
          '6.0.1 (MMB29T)'], 
'"hammerhead" for Nexus 5 (GSM/LTE)': ['4.4 (KRT16M)', 
             '4.4.2 (KOT49H)', 
             '4.4.3 (KTU84M)', 
             '4.4.4 (KTU84P)', 
             '4.4.4 Release 2 (For 2Degrees/NZ, ' 
             'Telstra/AUS and India ONLY) (KTU84Q)', 
             '5.0 (LRX21O)', 
             '5.0.1 (LRX22C)', 
             '5.1.0 (LMY47D)', 
             '5.1.0 (LMY47I)', 
             '5.1.1 (LMY48B)', 
             '5.1.1 (LMY48I)', 
             '5.1.1 (LMY48M)', 
             '6.0.0 (MRA58K)', 
             '6.0.0 (MRA58N)', 
             '6.0.1 (MMB29K)', 
             '6.0.1 (MMB29S)'], 
'"mantaray" for Nexus 10': ['4.2.2 (JDQ39)', 
          '4.3 (JWR66Y)', 
          '4.4 (KRT16S)', 
          '4.4.2 (KOT49H)', 
          '4.4.3 (KTU84L)', 
          '4.4.4 (KTU84P)', 
          '5.0 (LRX21P)', 
          '5.0.1 (LRX22C)', 
          '5.0.2 (LRX22G)', 
          '5.1.0 (LMY47D)', 
          '5.1.1 (LMY47V)', 
          '5.1.1 (LMY48I)', 
          '5.1.1 (LMY48M)', 
          '5.1.1 (LMY48T)', 
          '5.1.1 (LMY48X)', 
          '5.1.1 (LMY48Z)', 
          '5.1.1 (LMY49F)'], 
'"mysid" for Galaxy Nexus "toro" (Verizon CDMA/LTE)': ['4.0.4 (IMM76K)', 
                 '4.1.1 (JRO03O)', 
                 '4.2.2 (JDQ39)'], 
'"mysidspr" for Galaxy Nexus "toroplus" (Sprint CDMA/LTE)': ['4.1.1 (FH05)', 
                   '4.2.1 (GA02)'], 
'"nakasi" for Nexus 7 (Wi-Fi)': ['4.1.2 (JZO54K)', 
            '4.2.2 (JDQ39)', 
            '4.3 (JWR66Y)', 
            '4.4 (KRT16S)', 
            '4.4.2 (KOT49H)', 
            '4.4.3 (KTU84L)', 
            '4.4.4 (KTU84P)', 
            '5.0 (LRX21P)', 
            '5.0.2 (LRX22G)', 
            '5.1.0 (LMY47D)', 
            '5.1.1 (LMY47V)'], 
'"nakasig" for Nexus 7 (Mobile)': ['4.2.2 (JDQ39)', 
            '4.3 (JWR66Y)', 
            '4.4 (KRT16S)', 
            '4.4.2 (KOT49H)', 
            '4.4.3 (KTU84L)', 
            '4.4.4 (KTU84P)', 
            '5.0.2 (LRX22G)', 
            '5.1.0 (LMY47D)', 
            '5.1.1 (LMY47V)'], 
'"occam" for Nexus 4': ['4.2.2 (JDQ39)', 
         '4.3 (JWR66Y)', 
         '4.4 (KRT16S)', 
         '4.4.2 (KOT49H)', 
         '4.4.3 (KTU84L)', 
         '4.4.4 (KTU84P)', 
         '5.0 (LRX21T)', 
         '5.0.1 (LRX22C)', 
         '5.1.0 (LMY47O)', 
         '5.1.1 (LMY47V)', 
         '5.1.1 (LMY48I)', 
         '5.1.1 (LMY48M)', 
         '5.1.1 (LMY48T)'], 
'"razor" for Nexus 7 [2013] (Wi-Fi)': ['4.3 (JSS15Q)', 
             '4.3 (JSS15R)', 
             '4.4 (KRT16S)', 
             '4.4.2 (KOT49H)', 
             '4.4.3 (KTU84L)', 
             '4.4.4 (KTU84P)', 
             '5.0 (LRX21P)', 
             '5.0.1 (LRX22C)', 
             '5.0.2 (LRX22G)', 
             '5.1.0 (LMY47O)', 
             '5.1.1 (LMY47V)', 
             '5.1.1 (LMY48G)', 
             '5.1.1 (LMY48I)', 
             '5.1.1 (LMY48M)', 
             '5.1.1 (LMY48T)', 
             '6.0.0 (MRA58K)', 
             '6.0.0 (MRA58U)', 
             '6.0.0 (MRA58V)', 
             '6.0.1 (MMB29K)', 
             '6.0.1 (MMB29O)'], 
'"razorg" for Nexus 7 [2013] (Mobile)': ['4.3 (JLS36C)', 
              '4.3.1 (JLS36I)', 
              '4.4 (KRT16S)', 
              '4.4.2 (KOT49H)', 
              '4.4.2_r2 (Verizon) (KVT49L)', 
              '4.4.3 (KTU84L)', 
              '4.4.4 (KTU84P)', 
              '5.0.2 (LRX22G)', 
              '5.1.0 (LMY47O)', 
              '5.1.1 (LMY47V)', 
              '5.1.1 (LMY48P)', 
              '5.1.1 (LMY48U)', 
              '5.1.1 (LMY48X)', 
              '5.1.1 (LMY48Z)', 
              '6.0.0 (MRA58K)', 
              '6.0.0 (MRA58N)', 
              '6.0.0 (MRA58V)', 
              '6.0.0 (MRA59B)', 
              '6.0.1 (MMB29K)', 
              '6.0.1 (MMB29O)'], 
'"ryu" for Pixel C': ['6.0.1 (MXB48J)', '6.0.1 (MXB48K)'], 
'"shamu" for Nexus 6': ['5.0 (LRX21O)', 
         '5.0.1 (LRX22C)', 
         '5.1.0 (LMY47D)', 
         '5.1.0 (LMY47E)', 
         '5.1.0 (LMY47I)', 
         '5.1.0 (For T-Mobile ONLY) (LMY47M)', 
         '5.1.1 (All carriers except T-Mobile US) (LMY47Z)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28E)', 
         '5.1.1 (For Project Fi ONLY) (LVY48C)', 
         '5.1.1 (LMY48I)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28J)', 
         '5.1.1 (For Project Fi ONLY) (LVY48E)', 
         '5.1.1 (LMY48M)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28K)', 
         '5.1.1 (For Project Fi ONLY) (LVY48F)', 
         '5.1.1 (LMY48T)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28M)', 
         '5.1.1 (For Project Fi ONLY) (LVY48H)', 
         '5.1.1 (LMY48W)', 
         '5.1.1 (LMY48X)', 
         '5.1.1 (LMY48Y)', 
         '5.1.1 (For T-Mobile ONLY) (LYZ28N)', 
         '5.1.1 (For Project Fi ONLY) (LVY48I)', 
         '6.0.0 (MRA58K)', 
         '6.0.0 (MRA58N)', 
         '6.0.0 (MRA58R)', 
         '6.0.0 (MRA58X)', 
         '6.0.1 (MMB29K)', 
         '6.0.1 (MMB29S)'], 
'"soju" for Nexus S (worldwide version, i9020t and i9023)': ['2.3.6 (GRK39F)', 
                   '4.0.4 (IMM76D)', 
                   '4.1.2 (JZO54K)'], 
'"sojua" for Nexus S (850MHz version, i9020a)': ['2.3.6 (GRK39F)', 
                '4.0.4 (IMM76D)', 
                '4.1.2 (JZO54K)'], 
'"sojuk" for Nexus S (Korea version, m200)': ['2.3.6 (GRK39F)', 
               '4.0.4 (IMM76D)', 
               '4.1.1 (JRO03E)'], 
'"sojus" for Nexus S 4G (d720)': ['2.3.7 (GWK74)', 
            '4.0.4 (IMM76D)', 
            '4.1.1 (JRO03R)'], 
'"takju" for Galaxy Nexus "maguro" (GSM/HSPA+) (with Google Wallet)': ['4.0.4 ' 
                     '(IMM76I)', 
                     '4.1.2 ' 
                     '(JZO54K)', 
                     '4.2.2 ' 
                     '(JDQ39)', 
                     '4.3 ' 
                     '(JWR66Y)'], 
'"tungsten" for Nexus Q': ['4.0.4 (IAN67K)'], 
'"volantis" for Nexus 9 (Wi-Fi)': ['5.0 (LRX21Q)', 
            '5.0 (LRX21R)', 
            '5.0.1 (LRX22C)', 
            '5.0.2 (LRX22L)', 
            '5.1.1 (LMY47X)', 
            '5.1.1 (LMY48I)', 
            '5.1.1 (LMY48M)', 
            '5.1.1 (LMY48T)', 
            '6.0.0 (MRA58K)', 
            '6.0.0 (MRA58N)', 
            '6.0.1 (MMB29K)', 
            '6.0.1 (MMB29S)'], 
'"volantisg" for Nexus 9 (LTE)': ['5.0.1 (LRX22C)', 
            '5.0.2 (LRX22L)', 
            '5.1.1 (LMY47X)', 
            '5.1.1 (LMY48I)', 
            '5.1.1 (LMY48M)', 
            '5.1.1 (LMY48T)', 
            '5.1.1 (LMY48X)', 
            '5.1.1 (LMY48Z)', 
            '5.1.1 (LMY49F)', 
            '6.0.0 (MRA58K)', 
            '6.0.0 (MRA58N)', 
            '6.0.1 (MMB29K)', 
            '6.0.1 (MMB29S)'], 
'"yakju" for Galaxy Nexus "maguro" (GSM/HSPA+)': ['4.0.4 (IMM76I)', 
                '4.1.2 (JZO54K)', 
                '4.2.2 (JDQ39)', 
                '4.3 (JWR66Y)']} 
+0

我最後採用了這個方法,不過針對我的代碼稍作了調整。謝謝! – vaindil

1

下面的代碼會生成一個包含每個設備最後一個條目的列表。要做到這一點,你仍然需要遍歷所有條目,但之後只保留最後一個條目,如下所示:

from bs4 import BeautifulSoup  
import requests 


# Fetch the Nexus factory-images page and print, for every device heading,
# the first cell of the last release row in the table that follows it.
# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get("https://developers.google.com/android/nexus/images", timeout=30)
# Fail loudly on an HTTP error instead of silently printing nothing.
html.raise_for_status()
soup = BeautifulSoup(html.text, "lxml")
models = []

# The first <h2 id=...> is skipped, as in the original snippet.
for h2 in soup.find_all('h2', id=True)[1:]:
    table = h2.find_next('table')
    rows = table.find_all('tr', id=True) if table is not None else []
    if not rows:
        # No table or no release rows under this heading: skip it rather
        # than fail on rows[-1].
        continue

    # The newest release is the last row of the table.
    cells = [t.text.strip() for t in rows[-1].find_all('td')]
    if not cells:
        continue
    models.append([h2.text] + cells)

# Each entry is [heading, first cell, ...remaining cells]; only the first two
# are printed, so rows with any number of extra cells are accepted.
for device, version, *_ in models:
    # print() is a function in Python 3; the statement form is a SyntaxError.
    print('{}, {}'.format(device, version))

這顯示以下內容:

"ryu" for Pixel C, 6.0.1 (MXB48K) 
"angler" for Nexus 6P, 6.0.1 (MMB29P) 
"bullhead" for Nexus 5X, 6.0.1 (MMB29P) 
"shamu" for Nexus 6, 6.0.1 (MMB29S) 
"fugu" for Nexus Player, 6.0.1 (MMB29T) 
"volantisg" for Nexus 9 (LTE), 6.0.1 (MMB29S) 
"volantis" for Nexus 9 (Wi-Fi), 6.0.1 (MMB29S) 
"hammerhead" for Nexus 5 (GSM/LTE), 6.0.1 (MMB29S) 
"razor" for Nexus 7 [2013] (Wi-Fi), 6.0.1 (MMB29O) 
"razorg" for Nexus 7 [2013] (Mobile), 6.0.1 (MMB29O) 
"mantaray" for Nexus 10, 5.1.1 (LMY49F) 
"occam" for Nexus 4, 5.1.1 (LMY48T) 
"nakasi" for Nexus 7 (Wi-Fi), 5.1.1 (LMY47V) 
"nakasig" for Nexus 7 (Mobile), 5.1.1 (LMY47V) 
"tungsten" for Nexus Q, 4.0.4 (IAN67K) 
"takju" for Galaxy Nexus "maguro" (GSM/HSPA+) (with Google Wallet), 4.3 (JWR66Y) 
"yakju" for Galaxy Nexus "maguro" (GSM/HSPA+), 4.3 (JWR66Y) 
"mysid" for Galaxy Nexus "toro" (Verizon CDMA/LTE), 4.2.2 (JDQ39) 
"mysidspr" for Galaxy Nexus "toroplus" (Sprint CDMA/LTE), 4.2.1 (GA02) 
"soju" for Nexus S (worldwide version, i9020t and i9023), 4.1.2 (JZO54K) 
"sojua" for Nexus S (850MHz version, i9020a), 4.1.2 (JZO54K) 
"sojuk" for Nexus S (Korea version, m200), 4.1.1 (JRO03E) 
"sojus" for Nexus S 4G (d720), 4.1.1 (JRO03R)