標題位於 th 標籤中,而不是你正在尋找的 td 標籤。如果打印 coffee_titles,你會看到它是 None,所以你的代碼爲什麼失敗就很清楚了。
要獲取表中的所有數據,可以從表中提取日期,並把它們作爲鍵:
from collections import OrderedDict
from pprint import pprint as pp

import requests
from bs4 import BeautifulSoup

# Download the historical-data page and fail loudly on an HTTP error
# instead of trying to parse an error page.
r = requests.get("http://www.investing.com/commodities/us-coffee-c-historical-data")
r.raise_for_status()

od = OrderedDict()
soup = BeautifulSoup(r.content, "lxml")

# select the table
table = soup.select_one("table.genTbl.closedTbl.historicalTbl")
if table is None:
    # select_one returns None when nothing matches; report that directly
    # rather than failing later with an AttributeError on None.
    raise ValueError("historical data table not found in the response")

# All column names except the first one: the first cell of every row is
# used as the dictionary key below, so its header is not needed.
cols = [th.text for th in table.select("th")[1:]]

# Walk every row and skip the ones without <td> cells (i.e. the header
# row, which only holds <th> cells).  The selector "tr + tr" would also
# drop the first data row whenever the rows are wrapped in <thead>/<tbody>,
# because that row has no preceding <tr> sibling.
for row in table.select("tr"):
    # get all the data including the date
    cells = [td.text for td in row.select("td")]
    if not cells:
        continue
    # use date as the key and store the remaining values by column name
    od[cells[0]] = dict(zip(cols, cells[1:]))

pp(dict(od))
輸出:
{u'Jun 01, 2016': {u'Change %': u'0.29%',
u'High': u'123.10',
u'Low': u'120.85',
u'Open': u'121.50',
u'Price': u'121.90',
u'Vol.': u'18.55K'},
u'Jun 02, 2016': {u'Change %': u'0.90%',
u'High': u'124.40',
u'Low': u'122.15',
u'Open': u'122.50',
u'Price': u'123.00',
u'Vol.': u'22.11K'},
u'Jun 03, 2016': {u'Change %': u'3.33%',
u'High': u'127.40',
u'Low': u'122.50',
u'Open': u'122.60',
u'Price': u'127.10',
u'Vol.': u'28.47K'},
u'Jun 06, 2016': {u'Change %': u'3.62%',
u'High': u'132.05',
u'Low': u'127.10',
u'Open': u'127.30',
u'Price': u'131.70',
u'Vol.': u'30.65K'},
u'May 09, 2016': {u'Change %': u'2.49%',
u'High': u'126.60',
u'Low': u'123.28',
u'Open': u'125.65',
u'Price': u'126.53',
u'Vol.': u'-'},
u'May 10, 2016': {u'Change %': u'0.29%',
u'High': u'125.90',
u'Low': u'125.90',
u'Open': u'125.90',
u'Price': u'126.90',
u'Vol.': u'0.01K'},
u'May 11, 2016': {u'Change %': u'2.26%',
u'High': u'129.77',
u'Low': u'126.88',
u'Open': u'128.60',
u'Price': u'129.77',
u'Vol.': u'-'},
u'May 12, 2016': {u'Change %': u'-1.21%',
u'High': u'128.75',
u'Low': u'127.30',
u'Open': u'128.75',
u'Price': u'128.20',
u'Vol.': u'0.01K'},
u'May 13, 2016': {u'Change %': u'0.47%',
u'High': u'127.85',
u'Low': u'127.80',
u'Open': u'127.85',
u'Price': u'128.80',
u'Vol.': u'0.01K'},
u'May 16, 2016': {u'Change %': u'3.03%',
u'High': u'131.95',
u'Low': u'128.75',
u'Open': u'128.75',
u'Price': u'132.70',
u'Vol.': u'0.01K'},
u'May 17, 2016': {u'Change %': u'-0.64%',
u'High': u'132.60',
u'Low': u'132.60',
u'Open': u'132.60',
u'Price': u'131.85',
u'Vol.': u'-'},
u'May 18, 2016': {u'Change %': u'-1.93%',
u'High': u'129.65',
u'Low': u'128.15',
u'Open': u'128.85',
u'Price': u'129.30',
u'Vol.': u'0.02K'},
u'May 19, 2016': {u'Change %': u'-4.14%',
u'High': u'129.00',
u'Low': u'123.70',
u'Open': u'128.95',
u'Price': u'123.95',
u'Vol.': u'29.69K'},
u'May 20, 2016': {u'Change %': u'0.61%',
u'High': u'125.95',
u'Low': u'124.25',
u'Open': u'124.75',
u'Price': u'124.70',
u'Vol.': u'15.54K'},
u'May 23, 2016': {u'Change %': u'-2.04%',
u'High': u'124.70',
u'Low': u'122.00',
u'Open': u'124.50',
u'Price': u'122.15',
u'Vol.': u'15.89K'},
u'May 24, 2016': {u'Change %': u'-0.29%',
u'High': u'123.30',
u'Low': u'121.55',
u'Open': u'122.45',
u'Price': u'121.80',
u'Vol.': u'15.06K'},
u'May 25, 2016': {u'Change %': u'-0.33%',
u'High': u'122.95',
u'Low': u'121.20',
u'Open': u'122.45',
u'Price': u'121.40',
u'Vol.': u'18.11K'},
u'May 26, 2016': {u'Change %': u'0.08%',
u'High': u'122.15',
u'Low': u'121.20',
u'Open': u'121.90',
u'Price': u'121.50',
u'Vol.': u'19.27K'},
u'May 27, 2016': {u'Change %': u'-0.16%',
u'High': u'122.35',
u'Low': u'120.80',
u'Open': u'122.10',
u'Price': u'121.30',
u'Vol.': u'13.52K'},
u'May 31, 2016': {u'Change %': u'0.21%',
u'High': u'123.90',
u'Low': u'121.35',
u'Open': u'121.55',
u'Price': u'121.55',
u'Vol.': u'23.62K'}}
要獲取特定日期的數據,我們需要模仿 Ajax 調用,向 http://www.investing.com/instruments/HistoricalDataAjax 發送 POST 請求:
from collections import OrderedDict
from pprint import pprint as pp

import requests
from bs4 import BeautifulSoup

# Form data to post.  The dates appear to be MM/DD/YYYY (the sample output
# for this payload covers April 2016).
# NOTE(review): "curr_id" is presumably the site's id for this instrument;
# confirm against the request shown in the browser's developer tools.
data = {"action": "historical_data",
        "curr_id": "8832",
        "st_date": "04/04/2016",
        "end_date": "04/08/2016",
        "interval_sec": "Daily"}

# add a user agent and specify that we are making an ajax request
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"}

with requests.Session() as s:
    r = s.post("http://www.investing.com/instruments/HistoricalDataAjax", data=data, headers=head)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()

od = OrderedDict()
soup = BeautifulSoup(r.content, "lxml")
table = soup.select_one("table.genTbl.closedTbl.historicalTbl")
if table is None:
    # select_one returns None when nothing matches; report that directly
    # rather than failing later with an AttributeError on None.
    raise ValueError("historical data table not found in the response")

# All column names except the first one: the first cell of every row is
# used as the dictionary key below, so its header is not needed.
cols = [th.text for th in table.select("th")][1:]

# Walk every row and skip the ones without <td> cells (the header row).
# "tr + tr" would also drop the first data row whenever the rows are
# wrapped in <thead>/<tbody>, because that row has no preceding <tr> sibling.
for row in table.select("tr"):
    # A separate name keeps the POST payload in `data` from being overwritten.
    cells = [td.text for td in row.select("td")]
    if not cells:
        continue
    od[cells[0]] = dict(zip(cols, cells[1:]))

pp(dict(od))
現在我們只會得到從 st_date 到 end_date 這一日期範圍內的數據:
{u'Apr 04, 2016': {u'Change %': u'-3.50%',
u'High': u'126.55',
u'Low': u'122.30',
u'Open': u'125.80',
u'Price': u'122.80',
u'Vol.': u'25.18K'},
u'Apr 05, 2016': {u'Change %': u'-1.55%',
u'High': u'122.85',
u'Low': u'120.55',
u'Open': u'122.85',
u'Price': u'120.90',
u'Vol.': u'25.77K'},
u'Apr 06, 2016': {u'Change %': u'0.50%',
u'High': u'122.15',
u'Low': u'120.00',
u'Open': u'121.45',
u'Price': u'121.50',
u'Vol.': u'17.94K'},
u'Apr 07, 2016': {u'Change %': u'-1.40%',
u'High': u'122.60',
u'Low': u'119.60',
u'Open': u'122.35',
u'Price': u'119.80',
u'Vol.': u'32.69K'}}
你可以在 Chrome 開發者工具的 XHR 選項卡下看到這個 POST 請求:
這太棒了,謝謝你 @padraiccunningham 的回答!謝謝你指出我沒有找到正確的標題標籤這個問題,我相信這可以歸結爲初學者的疏忽!感謝你抽出時間來幫助解決我的問題。如果可以的話,你能解釋一下模仿 Ajax 調用並 POST 到[link](http://www.investing.com/instruments/HistoricalDataAjax)的步驟嗎?我不太明白那裏發生了什麼,也不明白爲什麼必須這樣做才能請求特定日期的數據? – da4l
我剛纔注意到的唯一問題是表頭似乎錯位了。你標爲「日期」的位置應該是價格,依此類推。由於「日期」列把其他列往後推了一格,表頭「Change %」沒有被包含進來。我希望這說得通,嚴格來說,「日期」不必在其中 – da4l
@da4l,我們不應該包含日期列,因爲它是我們字典中的主鍵。至於 Ajax 請求,那是瀏覽器獲取數據的方式:打開開發者工具,看看更改日期時會發生什麼,它並不是一個把開始日期和結束日期作爲參數的簡單 GET 請求。 –