2014-04-09 49 views
0

我可以使用這個腳本成功抓取 this page 上的所有項目(問題標題:在 PHP 中對 JavaScript 頁面進行屏幕抓取):

/*
 * Scrape one product listing page into $result.
 *
 * Variables read (must be defined before this snippet runs):
 *   $list_url            URL of the listing page to download
 *   $product_location    XPath selecting the product link elements (href is read)
 *   $product_url_root    prefix prepended to every product href
 *   $photo_location      XPath selecting the product image elements (src is read)
 *   $photo_url_root      prefix prepended to every image src
 *   $was_price_location  XPath selecting the original-price nodes
 *   $now_price_location  XPath selecting the sale-price nodes
 *   $shop_name           value copied into every result row
 *
 * Variables written: $product_urls, $photo_url, $was_price, $now_price and
 * $result (a list of rows keyed product_url / shop_name / photo_url /
 * was_price / now_price).
 *
 * Note: file_get_contents() only returns the markup the server sends for
 * $list_url; it does not execute any JavaScript on the page.
 */
$html = file_get_contents($list_url);
$doc = new DOMDocument();
libxml_use_internal_errors(true);

// Start from empty lists every time so values from an earlier run cannot leak
// in, and so the final loop is safe even when a query matches nothing.
$product_urls = array();
$photo_url = array();
$was_price = array();
$now_price = array();
$result = array();

if (!empty($html)) {
    $doc->loadHTML($html);
    libxml_clear_errors(); // discard parser warnings caused by malformed HTML
    $xpath = new DOMXPath($doc);

    /* Collect the links to the product pages */
    $row = $xpath->query($product_location);
    // query() returns false for an invalid expression, so test that first.
    if ($row !== false && $row->length > 0) {
        foreach ($row as $location) {
            $product_urls[] = $product_url_root . $location->getAttribute('href');
        }
    } else {
        echo "product location is wrong<br>";
    }

    /* Collect the image links */
    $imgs = $xpath->query($photo_location);
    if ($imgs !== false && $imgs->length > 0) {
        foreach ($imgs as $img) {
            $photo_url[] = $photo_url_root . $img->getAttribute('src');
        }
    } else {
        echo "photo location is wrong<br>";
    }

    /* Collect the "was" prices (digits, commas and dots are kept as found) */
    $was = $xpath->query($was_price_location);
    if ($was !== false && $was->length > 0) {
        foreach ($was as $price) {
            $stripped = preg_replace("/[^0-9,.]/", "", $price->nodeValue);
            $was_price[] = "&pound;" . $stripped;
        }
    } else {
        echo "was price location is wrong<br>";
    }

    /* Collect the sale prices, normalised to two decimal places */
    $now = $xpath->query($now_price_location);
    if ($now !== false && $now->length > 0) {
        foreach ($now as $price) {
            $stripped = preg_replace("/[^0-9,.]/", "", $price->nodeValue);
            // Drop thousands separators before the cast: (float)"1,299.00"
            // stops at the comma and would yield 1.0.
            $stripped = str_replace(',', '', $stripped);
            $stripped = number_format((float)$stripped, 2, '.', '');
            $now_price[] = "&pound;" . $stripped;
        }
    } else {
        echo "now price location is wrong<br>";
    }

    /* Combine the parallel lists into one row per product */
    foreach ($product_urls as $i => $product_url) {
        $result[] = array(
            'product_url' => $product_url,
            'shop_name' => $shop_name,
            // The lists can differ in length (e.g. a product without a "was"
            // price); fall back to null instead of reading a missing index.
            'photo_url' => isset($photo_url[$i]) ? $photo_url[$i] : null,
            'was_price' => isset($was_price[$i]) ? $was_price[$i] : null,
            'now_price' => isset($now_price[$i]) ? $now_price[$i] : null
        );
    }
}

然而,出現了一個問題:如果我想抓取第 2 頁,或者把每頁顯示數量改為 100,file_get_contents($list_url) 始終只會返回第 1 頁的 24 個項目。

我認為翻頁是通過 AJAX 請求處理的(雖然我在頁面源碼中找不到任何相關證據)。有什麼方法可以精確地抓取我在屏幕上看到的內容嗎?

我看過以前一些提到 PhantomJS 的回答,但我不確定它是否適合我這裡使用 PHP 的情況。

回答

0
// Create DOM from URL or file.
// NOTE(review): file_get_html() is not a PHP built-in; it presumably comes from
// the Simple HTML DOM parser library, which must be included first - confirm.
$file = file_get_html('http://stackoverflow.com/');

// Print the href of every <a> element found in the document.
// The loop variable must match the name used in the body; the original
// declared $youreEement but echoed $yourElement, so nothing useful was printed.
foreach ($file->find('a') as $yourElement) {
    echo $yourElement->href . '<br>';
}