2013-07-12 41 views
0

我需要透過網址取得購物網站的標題、Meta 標籤和圖片標籤。這是我的代碼,它對亞馬遜的產品鏈接有效,但對某些網址不起作用(問題標題:外部網站的標題和Meta標籤),例如:

  1. http://www.alternate.de/Synology/Synology+DS413,_NAS/html/product/1028780/
  2. http://www.bonprix.de/produkt/baby-fleecejacke-hellgrau-meliert-958416/

我用來獲取這些標籤的代碼:

// Fetch a product page and collect its <title>, meta description/keywords and
// absolute JPEG image URLs, then print the collected data with print_r().
$url = "http://rads.stackoverflow.com/amzn/click/B009T9QCWI";
$timeout = 5;

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
// Follow HTTP redirects (bounded), otherwise a redirecting link yields the
// redirect response instead of the product page.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
// An empty string lets cURL advertise and transparently decode every
// content encoding it supports.
curl_setopt($ch, CURLOPT_ENCODING, "");
// NOTE(review): some shops appear to reject requests that carry no
// User-Agent header -- confirm this is what the failing sites need.
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MetaTagFetcher/1.0)");
$data = curl_exec($ch);
$curl_error = curl_error($ch);
curl_close($ch);

// Defaults, so the output below is well defined even when the page lacks a
// tag or the download fails.
$product_title = "";
$description = "";
$keywords = "";
$image = array();

if ($data === false || trim($data) === "") {
    // Nothing to parse: report why instead of feeding an empty document to DOM.
    trigger_error("Could not fetch " . $url . ": " . $curl_error, E_USER_WARNING);
} else {
    // Real-world HTML is rarely valid; collect libxml warnings internally
    // rather than silencing every error with "@".
    $previous_libxml_setting = libxml_use_internal_errors(true);
    $doc = new \DOMDocument();
    $doc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);

    // Page title, with single quotes replaced by spaces.
    $nodes = $doc->getElementsByTagName("title");
    if ($nodes->length > 0) {
        $product_title = str_replace("'", " ", $nodes->item(0)->nodeValue);
    }

    // Keep only absolute http(s) image URLs whose path ends in .jpg/.jpeg.
    foreach ($doc->getElementsByTagName("img") as $img) {
        $src = $img->getAttribute("src");
        $parts = parse_url($src);
        if ($parts === false || !isset($parts["scheme"], $parts["path"])) {
            continue;
        }
        if (!in_array(strtolower($parts["scheme"]), array("http", "https"), true)) {
            continue;
        }
        // Take the extension from the path only, so a query string
        // ("photo.jpg?v=2") or an upper-case suffix does not hide it.
        $extension = strtolower(pathinfo($parts["path"], PATHINFO_EXTENSION));
        if ($extension === "jpg" || $extension === "jpeg") {
            $image[] = $src;
        }
    }

    // Meta names are matched case-insensitively; the last matching tag wins.
    foreach ($doc->getElementsByTagName("meta") as $meta) {
        $meta_name = strtolower($meta->getAttribute("name"));
        if ($meta_name === "description") {
            $description = $meta->getAttribute("content");
        }
        if ($meta_name === "keywords") {
            $keywords = $meta->getAttribute("content");
        }
    }
}

// The 'img' key is only present when at least one image was found.
if (empty($image)) {
    $domarray[] = array('desc' => $description, 'title' => $product_title);
} else {
    $domarray[] = array('img' => $image, 'desc' => $description, 'title' => $product_title);
}
print_r($domarray);
+1

您試圖將HTML當作XML來解析。請不要這樣做(http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags)。我認爲亞馬遜有一個API,可以讓你以更容易解析的方式取得這些信息(https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html)。 – ToBe

+0

@ToBe 請注意,代碼中使用的是 `loadHTML()`,這樣做完全可行。 – hek2mgl

+0

謝謝你。但我的目標不僅僅是Amazon;這段代碼對Amazon是有效的。 –

回答

0

爲什麼不使用 simplehtmldom 解析器呢?

例子:

// Collect the title, named meta tags and image sources of a page using the
// Simple HTML DOM parser, then print them with print_r().
require_once 'simple_html_dom.php';

$url ="http://rads.stackoverflow.com/amzn/click/B009T9QCWI";

// all results stored in this array; every key is pre-initialised so the
// output keeps the same shape when a page has no meta tags or images
$result = array(
    'title' => '',
    'meta'  => array(),
    'image' => array(),
);

// NOTE(review): file_get_html() is expected to return false when the page
// cannot be loaded (or is larger than the library's size limit) -- confirm
// against the bundled simple_html_dom.php version.
$html = file_get_html($url);

if (!$html) {
    // Calling ->find() on a failed load would be a fatal error.
    trigger_error('Could not load ' . $url, E_USER_WARNING);
} else {
    // page title (find() yields null when the page has no <title>)
    $title = $html->find('title', 0);
    if ($title) {
        $result[ 'title' ] = $title->plaintext;
    }

    // get all meta tags, which have an attribute "name"
    foreach($html->find('meta[name]') as $meta) {
        $result[ 'meta' ][] = array(
            'name' => $meta->name,
            'content' => $meta->content
        );
    }

    // get all images
    foreach($html->find('img') as $image) {
        $result[ 'image' ][] = $image->src;
    }
}

print_r($result);

輸出

Array 
(
    [title] => Amazon.com: Samsung Galaxy S III, Black 16GB (Verizon Wireless): Cell Phones &amp; Accessories 
    [meta] => Array 
     (
      [0] => Array 
       (
        [name] => description 
        [content] => Shop cell phones and accessories at Amazon.com. You&#39;ll find great prices on cases, headsets, and the latest smartphones from carriers like Verizon, AT&amp;T, and Sprint 
       ) 

      [1] => Array 
       (
        [name] => title 
        [content] => Amazon.com: Samsung Galaxy S III, Black 16GB (Verizon Wireless): Cell Phones &amp; Accessories 
       ) 

      [2] => Array 
       (
        [name] => keywords 
        [content] => Samsung Galaxy S III, Black 16GB (Verizon Wireless),Samsung,Galaxy S III 
       ) 

     ) 

    [image] => Array 
     (
      [0] => http://g-ecx.images-amazon.com/images/G/01/gno/beacon/BeaconSprite-US-01._V397411194_.png 
      [1] => http://g-ecx.images-amazon.com/images/G/01/x-locale/common/transparent-pixel._V386942464_.gif 
      [2] => http://g-ecx.images-amazon.com/images/G/01/x-locale/common/transparent-pixel._V386942464_.gif 
      [3] => http://ecx.images-amazon.com/images/I/41%2Bh%2BUmrcRL._SY300_.jpg 
      [4] => http://ecx.images-amazon.com/images/I/41%2Bh%2BUmrcRL._SL500_AA280_.jpg 
      [5] => http://g-ecx.images-amazon.com/images/G/01/icons/icon-offsite-sl-7069-t4._V171196157_.png 
      [6] => http://g-ecx.images-amazon.com/images/G/01/icons/icon-offsite-sl-7069-t4._V171196157_.png 
      [7] => http://ecx.images-amazon.com/images/I/41FBSaIC4AL._SL500_SS100_.jpg 
      [8] => http://ecx.images-amazon.com/images/I/41HGvd6-jwL._SL500_SS100_.jpg 
      [9] => http://ecx.images-amazon.com/images/I/51jiU%2BiYWUL._SL500_SS100_.jpg 
      [10] => http://ecx.images-amazon.com/images/I/317JogSYmkL._SL500_SS100_.jpg 
      [11] => http://ecx.images-amazon.com/images/I/41d6B11BDuL._SL500_SS100_.jpg 
      [12] => http://ecx.images-amazon.com/images/I/41a94BWHXbL._SL500_SS100_.jpg 
      [13] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.main_SM.jpg 
      [14] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.pt01_SM.jpg 
      [15] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/wireless-box-logo-verizon-box.jpg 
      [16] => http://g-ecx.images-amazon.com/images/G/01/th/aplus/a-plus_bottom-217._V180545591_.gif 
      [17] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.pt02_SM.jpg 
      [18] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/amazon_app_suite_1_sma.jpg 
      [19] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/amazon_app_suite_5_sm.jpg 
      [20] => http://ecx.images-amazon.com/images/I/41HGvd6-jwL._SL75_SS50_.jpg 
      [21] => http://ecx.images-amazon.com/images/I/41FBSaIC4AL._SL75_SS50_.jpg 
      [22] => http://ecx.images-amazon.com/images/I/51jiU%2BiYWUL._SL75_SS50_.jpg 
      [23] => http://ecx.images-amazon.com/images/I/41a94BWHXbL._SL75_SS50_.jpg 
      [24] => http://g-ecx.images-amazon.com/images/G/01/x-locale/communities/reputation/suggestionbox._V192249929_.gif 
      [25] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif 
      [26] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif 
      [27] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif 
      [28] => http://g-ecx.images-amazon.com/images/G/01/gno/images/general/navAmazonLogoFooter._V169459313_.gif 
      [29] => /gp/uedata/unsticky/182-7026578-6696341//ntpoffrw?noscript&amp;id=158FKQCX6TYATFBQQW0V 
     ) 

) 

您可以在循環中遍歷多個URL,對每個URL執行同樣的操作。爲了簡單起見,我已經對圖像和元標記進行了檢查。希望對您有幫助。

+0

謝謝你。但它不適用於像 http://www.alternate.de/Synology/Synology+DS413,_NAS/html/product/1028780/ 和 http://www.bonprix.de/produkt/baby-fleecejacke-hellgrau-meliert-958416/ 這樣的網址。 –

+0

嗯..我會檢查。 –