2017-04-07 31 views
0

我使用BeautifulSoup進行網頁抓取。 所以我需要分析這個表格:Google Finance
正如你可以看到有「年度數據」和「季度數據」。當我提取表python返回季度數據,但我不知道如何提取年度。有人知道嗎? 以下是表示此鏈接的HTML代碼。Web抓取。在Python中,我如何使用主動鏈接進行操作?

<div class="g-unit g-first"> 
 
View: 
 
<a id="interim" class="id-interim nac" target="_blank">Quarterly Data</a>&nbsp;|&nbsp; 
 
<a id="annual" class="id-annual ac" target="_blank">Annual Data</a> 
 
</div>

這裏是我的代碼:

import requests 
from bs4 import BeautifulSoup 
import pandas as pd 

# Hard-coded sample figures; they only feed raw_df below and are not taken
# from the downloaded page.
raw_data = {'Param': ['Total Revenue', 'Cost of revenue', 'Gross profit', 
         'Operating expenses','Research Development'], 
      '2016': [123, 234343, 3423, 343, 323], 
      '2015': [3432423, 2342, 2342342, 356856, 36934], 
      '2014': [42, 52, 36, 24, 73], 
      '2013': [42, 52, 36, 24, 73]} 

url = 'https://www.google.com/finance?q=NASDAQ%3AAAPL&fstype=ii&ei=JQHoWMjKCcjDsAHAhqS4DA' 

# Download the page and parse it with the lxml backend.
r = requests.get(url) 
soup = BeautifulSoup(r.text, 'lxml') 
raw_df = pd.DataFrame(raw_data, columns = ['Param', '2016', '2015', '2014','2013']) 

# Walk every <tr> except the first one and copy the text of its first five
# <td> cells into the per-column lists.
# NOTE(review): `table`, `Revenue` and the `_2016_` .. `_2013_` lists are not
# defined anywhere in this snippet, so the loop raises NameError as written.

for row in table.find_all('tr')[1:]: 
    col = row.find_all('td') 
    column_1 = col[0].string.strip() 
    Revenue.append(column_1) 

    column_2 = col[1].string.strip() 
    _2016_.append(column_2) 

    column_3 = col[2].string.strip() 
    _2015_.append(column_3) 

    column_4 = col[3].string.strip() 
    _2014_.append(column_4) 

    column_5 = col[4].string.strip() 
    _2013_.append(column_5) 

# Assemble the collected lists into a DataFrame keyed by column heading.
columns = {'In Millions of USD': Revenue, '52 weeks ending 2016': _2016_, '52 weeks ending 2015': _2015_, '52 weeks ending 2014': _2014_, '52 weeks ending 2013': _2013_} 
df = pd.DataFrame(columns) 
+1

請分享您的代碼給我們。謝謝。 – anonyXmous

+0

感謝您的評論。完成 –

+0

@PiskarevDmitry,您是否嘗試過打開開發人員工具的網絡選項卡,觀察從季度切換到年度時發出的請求?尤其要留意隨請求一起發送的各個請求頭(headers);您需要按要求設置 Referer 和其他請求頭,才能獲取所需的數據。 – JkShaw

回答

0

您的代碼無法運行,但我大致明白您真正想要做的事情:您想把年度數據表格轉換爲 pandas 的 DataFrame。希望這對您有幫助。

import requests 
from bs4 import BeautifulSoup 
import pandas as pd 

# Scrape the annual-data table from the Google Finance page at `url` and load
# it into a pandas DataFrame with one column per reporting year.
params, _2016_, _2015_, _2014_, _2013_ = [], [], [], [], []
url = 'https://www.google.com/finance?q=NASDAQ%3AAAPL&fstype=ii&ei=JQHoWMjKCcjDsAHAhqS4DA'

r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
# Tip: print(soup) once to discover the class name used below.
# The annual report lives in the <div> carrying the class "id-incannualdiv".
table = soup.find("div", {"class": "id-incannualdiv"})
if table is None:
    # find() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("annual data div 'id-incannualdiv' not found in the response")

# Flat list of the text of every <td>, in document order.
rows = [t.text for t in table.find_all("td")]

# The cells are consumed five at a time: label first, then one value per year.
# Each cell is routed to its column list by its position within that group.
series = (params, _2016_, _2015_, _2014_, _2013_)
for i, cell in enumerate(rows):
    position = i % len(series)
    if position == 0:
        # Only the label cell has its trailing line breaks removed.
        cell = cell.rstrip("\r\n")
    series[position].append(cell)

df = pd.DataFrame(
    list(zip(params, _2016_, _2015_, _2014_, _2013_)),
    columns=['In Millions of USD', '52 weeks ending 2016', '52 weeks ending 2015',
             '52 weeks ending 2014', '52 weeks ending 2013'],
)
df.head()

enter image description here

0

只需把年度和季度兩個數據表一起放進 table 數組即可

import requests 
from bs4 import BeautifulSoup 
import pandas as pd 

# Hard-coded sample figures; they only feed raw_df below and are not taken
# from the downloaded page.
raw_data = {'Param': ['Total Revenue', 'Cost of revenue', 'Gross profit',
                      'Operating expenses', 'Research Development'],
            '2016': [123, 234343, 3423, 343, 323],
            '2015': [3432423, 2342, 2342342, 356856, 36934],
            '2014': [42, 52, 36, 24, 73],
            '2013': [42, 52, 36, 24, 73]}

url = 'https://www.google.com/finance?q=NASDAQ%3AAAPL&fstype=ii&ei=JQHoWMjKCcjDsAHAhqS4DA'

# Download the page and parse it with the lxml backend.
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
raw_df = pd.DataFrame(raw_data, columns=['Param', '2016', '2015', '2014', '2013'])
Revenue, _2016_, _2015_, _2014_, _2013_ = [], [], [], [], []

# Keep the second and third <table> elements of the page.
# NOTE(review): presumably these are the quarterly and annual tables - confirm
# against the live markup.
table = soup.find_all('table')[1:3]

# Destination list for each of the first five cells of a row, in cell order.
# NOTE(review): rows from both tables land in the same lists, so the
# "52 weeks ending" headings below label every collected row.
series = (Revenue, _2016_, _2015_, _2014_, _2013_)
for tab in table:
    # Skip the first <tr> of each table, then copy one cell per column list.
    for row in tab.find_all('tr')[1:]:
        col = row.find_all('td')
        if len(col) < len(series):
            # A row without all five cells cannot fill every column; skipping
            # it keeps the lists the same length and avoids an IndexError.
            continue
        for bucket, cell in zip(series, col):
            # get_text(strip=True) also handles cells with nested markup,
            # where `.string` is None and `.string.strip()` would raise.
            bucket.append(cell.get_text(strip=True))

columns = {'In Millions of USD': Revenue, '52 weeks ending 2016': _2016_, '52 weeks ending 2015': _2015_, '52 weeks ending 2014': _2014_, '52 weeks ending 2013': _2013_}
df = pd.DataFrame(columns)
相關問題