2016-11-24 34 views
0

我嘗試通過發送json請求來抓取this link。我的第一個要求是:獲取HTML元素並在python中發送新的json請求

# Fetch the first batch of headlines from MarketWatch's JSON endpoint.
parameters1 = {
    'ticker': 'XOM',
    'countryCode': 'US',
    'dateTime': '',
    'docId': '1222737422',  # fixed: original value had an accidental trailing space
    'docType': '806',
    'sequence': 'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',
    'messageNumber': '',
    'count': '10',
    'channelName': '/news/latest/company/us/xom',
    'topic': '',
    '_': '',
}
firstUrl = "http://www.marketwatch.com/news/headline/getheadlines"
# NOTE(review): `header` must be defined earlier as a dict of HTTP headers -- confirm
html1 = requests.get(firstUrl, params=parameters1, headers=header)
# Parse the JSON response body directly instead of json.loads(html1.text).
html_json1 = html1.json()

發送下一個請求,我必須從相應的HTML中提取的docId並將其添加到新的參數。我不知道該怎麼做。你知道如何在發送json請求之後得到新的HTML嗎?

+0

html_json1有什麼功能? – Backtrack

回答

1
import requests 
import json 

from bs4 import BeautifulSoup 


def main():
    """Scrape MarketWatch headlines for XOM, paging through the JSON feed.

    Loads the stock page to find the last headline's ``data-uniqueid``,
    then repeatedly calls the ``getheadlines`` JSON endpoint, using the
    last item of each batch as the paging cursor for the next request.
    Waits for the user to press <enter> between pages.
    """
    html_url = 'http://www.marketwatch.com/investing/stock/xom'

    resp = requests.get(html_url)
    if resp.status_code != 200:
        raise Exception("http request failed: %s" % resp)
    soup = BeautifulSoup(resp.text, 'lxml')

    # Get value of `data-uniqueid` from the last news node of
    # 'MarketWatch News on XOM'.
    li_node = soup.select("#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")[-1]
    unique_id = li_node['data-uniqueid']
    print('got unique_id=%r, from %r' % (unique_id, li_node.text.replace('\n', ' ').strip()))

    baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
    parameters = {
        'ticker': 'XOM',
        'countryCode': 'US',
        'docType': '806',
        'docId': '',  # (optional) initial value extracted from the HTML page
        'sequence': 'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2',  # initial value from HTML page
        'messageNumber': '8589',  # initial value from HTML page
        'count': '10',
        'channelName': '/news/latest/company/us/xom',
    }
    parameters.update(extract_page_params(unique_id))

    while True:
        resp = requests.get(baseUrl, params=parameters)
        # Fail fast instead of trying to JSON-parse an error page.
        if resp.status_code != 200:
            raise Exception("http request failed: %s" % resp)
        data = resp.json()  # array of (up to) 10 headline dicts
        if not data:
            # Empty batch: the feed is exhausted; the original code would
            # have crashed on data[0] here.
            break
        first = data[0]   # first item of the batch
        last = data[-1]   # last item of the batch (paging cursor)
        print("\ngot %d data, url: %s" % (len(data), resp.url))
        print("\tfirst: %-42s, %s" % (first['UniqueId'], first['SeoHeadlineFragment']))
        print("\t last: %-42s, %s" % (last['UniqueId'], last['SeoHeadlineFragment']))
        print("")

        # The last item's UniqueId drives the parameters of the next request.
        parameters.update(extract_page_params(last['UniqueId']))

        input("press <enter> to get next")

def extract_page_params(uid):
    """Turn a MarketWatch UniqueId into request paging parameters.

    A uid of the form ``<sequence>:<messageNumber>`` (e.g.
    ``'e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499'``) yields those two
    fields; any uid without a colon is treated as a plain document id.
    Returns a dict with keys 'sequence', 'messageNumber' and 'docId';
    unused fields are empty strings.
    """
    params = {'sequence': '', 'messageNumber': '', 'docId': ''}
    if ':' in uid:
        # Split the compound id into its two parts.
        params['sequence'], params['messageNumber'] = uid.split(':')
    else:
        params['docId'] = uid
    return params


# Run the scraper only when executed as a script (not when imported).
if __name__ == '__main__': 
    main() 

這是我的代碼來解決您的問題。
由於您是編程新手,我添加了一些評論。
你可以直接複製並運行python版本3.(2也可以)

+0

好的,我的gmail是panyanyany ## gmail dot com。隨時聯繫我。 ;-) @farshidbalan –

0

您可以使用Beautiful Soup從html中提取數據。它是一個用於從HTML中提取數據的python庫。

+0

謝謝,我不知道如何用BeautifulSoup做到這一點。 –