2015-06-19 59 views
0

我試圖抓取此網頁:SiriusXMU,以獲取「正在播放」的信息。下面是我到目前爲止寫出的代碼(主題:爲特定網頁檢索定義特定的PHP cURL選項):

// Fetch the SiriusXMU channel page with cURL, sending browser-like request headers.
// On success $result holds the response (headers + body); on failure it is FALSE
// and a warning carrying the cURL error text is raised.
$timeout = 60; // upper bound, in seconds, for the whole transfer
$url     = 'http://www.siriusxm.com/siriusxmu';
$agent   = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0';
$referer = 'http://www.siriusxm.com/channellineup/';

// Start from an empty array so entries left in $header by earlier code are never sent.
$header   = array();
$header[] = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
//$header[] = "Keep-Alive: 300";
//$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-US,en;q=0.5";

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);                 // page to fetch
curl_setopt($ch, CURLOPT_USERAGENT, $agent);         // "User-Agent:" request header
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);       // extra request headers built above
curl_setopt($ch, CURLOPT_HEADER, true);              // response headers are included in $result, ahead of the body
curl_setopt($ch, CURLOPT_REFERER, $referer);         // "Referer:" request header
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');  // sets "Accept-Encoding:" and enables decoding of the response
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);         // maximum seconds the cURL call may run
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);      // make curl_exec() return the response instead of printing it

// Options that were tried and are currently disabled:
//curl_setopt($ch, CURLOPT_AUTOREFERER, true);      // set Referer automatically when following a Location: redirect
//curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);   // follow Location: redirects (recursive unless CURLOPT_MAXREDIRS is set)
//curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);  // FALSE stops verification of the peer's certificate
//curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);      // 2 (the default) also verifies the certificate name matches the host
//curl_setopt($ch, CURLOPT_VERBOSE, true);          // verbose output to STDERR or the CURLOPT_STDERR file

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns the response on success, FALSE on failure.
$result = curl_exec($ch);
if ($result === false) {
    // Read the error while the handle is still open; it is unavailable after curl_close().
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);

我一直在研究瀏覽器發送的HTTP標頭;使用這些標頭時,瀏覽器能夠成功呈現網頁中顯示當前播放內容的「On the Air」部分。但是,當我使用curl模擬這些標頭時,網頁的「On the Air」部分卻返回「對不起,程序信息不適用於所選平臺」。

Firefox附加元件HttpFox顯示主頁面的請求如下:

00:00:03.904 0.163 1524 209 GET 200 text/html http://www.siriusxm.com/siriusxmu 

(Request-Line) GET /siriusxmu HTTP/1.1 
Host www.siriusxm.com 
User-Agent Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0 
Accept text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 
Accept-Language en-US,en;q=0.5 
Accept-Encoding gzip, deflate 
Referer http://www.siriusxm.com/channellineup/ 
Cookie mmcore.tst=0.557; mmid=-318486443%7CBQAAAAo2JYEzEgwAAA%3D%3D; mmcore.pd=111492824%7CBQAAAAoBQjYlgTMSDPt9EvUCAJ3zFneyeNJIDwAAAIQ4RsgceNJIAAAAAP//////////ABB3d3cuc2lyaXVzeG0uY29tAhIMAgAAAAAAAAAAAAD///////////////8AAAAAAAFF; mmcore.srv=cg5.usw; __utma=1.1327546933.1434659528.1434659528.1434723665.2; __utmz=1.1434659528.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_nr=1434723821271-Repeat; s_vnum=1435723200051%26vn%3D2; s_lastvisit=1434723660883; s_vi=[CS]v1|2AC1956485078C76-6000010E20030C67[CE]; mm_pc=%7B%22vehiclenewness%22%3A%22new%22%2C%22PC2%22%3A%22%22%7D; sxm_platform=xm; __utmv=1.|5=serviceType=xm=1; _hjUserId=86ab277e-6c63-4dd1-975c-3424e32502e6; __insp_slim=1434659556045; __insp_wid=800165747; __insp_nv=true; __insp_ref=aHR0cDovL3d3dy5zaXJpdXN4bS5jb20vc3RyZWFtaW5n; __insp_norec_sess=true; _hjIncludedInSample=1; __utmc=1; s_cc=true; SC_LINKS=%5B%5BB%5D%5D; s_sq=%5B%5BB%5D%5D; s_sv_sid=797366592635; QSI_HistorySession=http%3A%2F%2Fwww.siriusxm.com%2Fstreaming~1434659533837%7Chttp%3A%2F%2Fwww.siriusxm.com%2Fchannellineup%2F%23~1434659556190%7Chttp%3A%2F%2Fwww.siriusxm.com%2Fsiriusxmu~1434659575429; s_invisit=true; __utmb=1.8.10.1434723665 
Connection keep-alive 

而在請求「On the Air」部分的JavaScript時,顯示如下:

00:00:05.293 1.186 1609 (137) GET 304 text/javascript http://www.siriusxm.com/static/app/js/sxm-channel-ontheair.js 

(Request-Line) GET /static/app/js/sxm-channel-ontheair.js HTTP/1.1 
Host www.siriusxm.com 
User-Agent Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0 
Accept */* 
Accept-Language en-US,en;q=0.5 
Accept-Encoding gzip, deflate 
Referer http://www.siriusxm.com/siriusxmu 
Cookie mmcore.tst=0.557; mmid=-318486443%7CBQAAAAo2JYEzEgwAAA%3D%3D; mmcore.pd=111492824%7CBQAAAAoBQjYlgTMSDPt9EvUCAJ3zFneyeNJIDwAAAIQ4RsgceNJIAAAAAP//////////ABB3d3cuc2lyaXVzeG0uY29tAhIMAgAAAAAAAAAAAAD///////////////8AAAAAAAFF; mmcore.srv=cg5.usw; __utma=1.1327546933.1434659528.1434659528.1434723665.2; __utmz=1.1434659528.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_nr=1434723821271-Repeat; s_vnum=1435723200051%26vn%3D2; s_lastvisit=1434723660883; s_vi=[CS]v1|2AC1956485078C76-6000010E20030C67[CE]; mm_pc=%7B%22vehiclenewness%22%3A%22new%22%2C%22PC2%22%3A%22%22%7D; sxm_platform=xm; __utmv=1.|5=serviceType=xm=1; _hjUserId=86ab277e-6c63-4dd1-975c-3424e32502e6; __insp_slim=1434659556045; __insp_wid=800165747; __insp_nv=true; __insp_ref=aHR0cDovL3d3dy5zaXJpdXN4bS5jb20vc3RyZWFtaW5n; __insp_norec_sess=true; _hjIncludedInSample=1; __utmc=1; s_cc=true; SC_LINKS=%5B%5BB%5D%5D; s_sq=%5B%5BB%5D%5D; s_sv_sid=797366592635; QSI_HistorySession=http%3A%2F%2Fwww.siriusxm.com%2Fstreaming~1434659533837%7Chttp%3A%2F%2Fwww.siriusxm.com%2Fchannellineup%2F%23~1434659556190%7Chttp%3A%2F%2Fwww.siriusxm.com%2Fsiriusxmu~1434659575429; s_invisit=true; __utmb=1.8.10.1434723665 
Connection keep-alive 
If-Modified-Since Fri, 22 May 2015 02:06:57 GMT 
If-None-Match "ab841364-8501-516a21d70499b" 
Cache-Control max-age=0 

Web服務器似乎判定我的curl請求無效,因此沒有啓用「On the Air」的JavaScript內容,只顯示「對不起,程序信息不適用於所選平臺」。

如何讓curl正常工作並模擬我的瀏覽器,從而從此Web服務器返回有效的網頁結果?

回答

2

看來您需要運行一個具有JavaScript解釋器的客戶端。

HTML包括以下內容:

<div id="on-the-air-unavailable"><p>Sorry, program information is not available for the selected platform.</p></div> 

JS包括以下內容(兩者並不在同一處):

$("#on-the-air-unavailable").hide(); 
$("#on-the-air-unavailable").show(); 

要讓JavaScript與HTML互動,你需要將二者放在一起運行。

可以使用一些帶有JS解釋器的無頭HTTP客戶端,或者像Selenium這樣的瀏覽器自動化工具。

+0

請建議一些帶有JS解釋器的無頭HTTP客戶端:看起來SimpleTest的PHP腳本化Web瀏覽器(http://www.simpletest.org/en/browser_documentation.html)不包含JS解釋器。編輯:我在以下答案中找到了一些:http://stackoverflow.com/a/814929/5006730 – BartmanEH