2014-03-03 219 views
2

從網站抓取圖片：我試圖使用 PHP 從網站抓取圖片。

我試圖抓取的頁面是: http://www.reebonz.com.sg/event/t7349#/event/t7349

但是用我的代碼，我只抓到頁面 <head> 裡的 href。我的代碼是：

<?php 

/*
 * Crawl script: for every configured category, fetch its listing page,
 * extract the product links from the recommendation list, fetch each
 * product page, and download the product's zoom image into a
 * "<category>/<brand>" folder.
 *
 * Uses get_data(), splice_data() and splice_list() from function.php.
 * grab_image() is called below but is not defined in the code shown here;
 * presumably it also lives in function.php -- TODO confirm.
 */
require_once ('function.php'); 

$advt_id = "88477"; 

$programurl = "http://www.reebonz.com.sg/event_list/1/"; 

// Prefix prepended to every product link found on a listing page.
$baseurl = "http://www.reebonz.com.sg/event_list/1/"; 

$crawl_data []= array ("department" => 0, "category" => "bags" , "advt_cat" => "BALENCIAGA", "cat_url" => 'http://www.reebonz.com.sg/event/t7349#/event/t7349'); 

// Iterate over the configured categories. The original code read $url and
// $crawl_cat without ever assigning them, so nothing useful was fetched and
// the target folder name was built from undefined values.
foreach ($crawl_data as $crawl_cat) 
{ 
    $url = $crawl_cat['cat_url']; 

    $data = get_data($url); 

    if ($data === false) 
    { 
        // get_data() returns curl_exec()'s result, which is false on failure.
        echo "\n**** Could not fetch ".$url." ***\n"; 
        continue; 
    } 

    $product_raw = splice_data ($data, 'ul class="rec-items-ul ng-scope"',1, '</ul>',1); 

    $product_list = splice_list ($product_raw, 'href="', '"'); 

    echo "\n**** Got Product List ".count($product_list)." ***\n"; 

    print_r ($product_list); 

    // Build the folder name with the platform's separator instead of a
    // hard-coded backslash, which only forms a nested path on Windows.
    $filePath = $crawl_cat['category'].DIRECTORY_SEPARATOR.$crawl_cat['advt_cat']; 

    foreach ($product_list as $product) 
    { 
        // Very short hrefs are printed and skipped rather than crawled.
        if ((strlen($product) < 10)) 
        { 
            echo $product; 
            continue; 
        } 

        $url = $baseurl.$product; 

        $data = get_data($url); 

        if ($data === false) 
        { 
            // Skip products whose page could not be downloaded.
            continue; 
        } 

        $img_data = splice_data ($data, 'class="rbz_product-zoom-image row"', 1, '</div>', 1); 

        $img_url = splice_data ($img_data, 'href="',1, '"', 1); 

        echo $img_url; 

        if (!file_exists($filePath)) { 
            mkdir($filePath, 0777, true); 
        } 
        grab_image($img_url,$filePath); 

        echo "*"; 

    }// end of product insert for 

}// end of category for 

?> 

function.php 的內容是：

/**
 * Return the text located between a start marker and an end marker.
 *
 * Matching is case-insensitive (stripos). The start marker is the
 * $startoccur-th occurrence of $startstr in $data; the end marker is the
 * $endoccur-th occurrence of $endstr found after the start marker.
 * Occurrence counts below 1 are treated as 1.
 *
 * Fixes over the previous version:
 *  - for $startoccur > 1 the slice began one character too late;
 *  - a missing marker produced an arbitrary slice instead of a clean result;
 *  - $endoccur was accepted but ignored.
 *
 * @param string $data       Text to search.
 * @param string $startstr   Marker that precedes the wanted text.
 * @param int    $startoccur Which occurrence of $startstr to use (1-based).
 * @param string $endstr     Marker that follows the wanted text.
 * @param int    $endoccur   Which occurrence of $endstr after the start to use (1-based).
 *
 * @return string The text between the markers, or '' when either marker is
 *                missing or any of the string arguments is empty.
 */
function splice_data ($data, $startstr, $startoccur, $endstr, $endoccur) 
{ 
    $data = (string) $data; 
    $startstr = (string) $startstr; 
    $endstr = (string) $endstr; 

    // An empty needle cannot be searched for; an empty haystack has no match.
    if ($data === '' || $startstr === '' || $endstr === '') 
    { 
        return ''; 
    } 

    // Walk to the requested occurrence of the start marker.
    $start = false; 
    $offset = 0; 
    for ($i = max(1, (int) $startoccur); $i > 0; $i--) 
    { 
        $start = stripos($data, $startstr, $offset); 
        if ($start === false) 
        { 
            return ''; 
        } 
        // Resume one character after this match to find the next one.
        $offset = $start + 1; 
    } 

    $content_start = $start + strlen($startstr); 

    // Walk to the requested occurrence of the end marker after the start.
    $end = false; 
    $offset = $content_start; 
    for ($i = max(1, (int) $endoccur); $i > 0; $i--) 
    { 
        $end = stripos($data, $endstr, $offset); 
        if ($end === false) 
        { 
            return ''; 
        } 
        $offset = $end + 1; 
    } 

    return substr($data, $content_start, $end - $content_start); 
} 


/**
 * Collect every piece of text enclosed by a start marker and an end marker.
 *
 * Scans $img_data from the beginning (case-insensitively) and, for each
 * $start_str found, takes the text up to the next $end_str. Each value is
 * passed through str_replace($find, $replace, ...) and duplicates are
 * dropped with array_unique(), so the keys of the result keep the position
 * of each value's first appearance and may have gaps.
 *
 * Fixes over the previous version:
 *  - a match at position 0 or 1 was skipped;
 *  - a start marker without a following end marker could loop forever;
 *  - no match at all passed an undefined variable to array_unique().
 *
 * @param string $img_data  Text to scan.
 * @param string $start_str Marker that precedes each value.
 * @param string $end_str   Marker that terminates each value.
 * @param mixed  $find      Search argument for str_replace() applied to each value.
 * @param mixed  $replace   Replacement argument for str_replace().
 *
 * @return array Unique extracted values; empty array when nothing matches.
 */
function splice_list ($img_data, $start_str, $end_str, $find = '', $replace = '') 
{ 
    $img_data = (string) $img_data; 
    $start_str = (string) $start_str; 
    $end_str = (string) $end_str; 
    $start_len = strlen($start_str); 
    $data_list = array(); 

    // Empty needles cannot be searched for and would never advance the scan.
    if ($img_data === '' || $start_len === 0 || $end_str === '') 
    { 
        return $data_list; 
    } 

    $offset = 0; 
    while (($start = stripos($img_data, $start_str, $offset)) !== false) 
    { 
        $value_start = $start + $start_len; 

        $end = stripos($img_data, $end_str, $value_start); 
        if ($end === false) 
        { 
            // Unterminated value: stop instead of rescanning from the start.
            break; 
        } 

        $data_list[] = str_replace($find, $replace, substr($img_data, $value_start, $end - $value_start)); 

        // Resume at the end marker, as before; it is always past $start,
        // so the scan is guaranteed to make progress.
        $offset = $end; 
    } 

    return array_unique($data_list); 
} 


/**
 * Download a URL with cURL and return the raw response.
 *
 * Because CURLOPT_HEADER is enabled, the returned string contains the
 * response headers followed by the body. Redirects are followed (up to 10).
 *
 * Fixes over the previous version:
 *  - the user agent was picked from an undefined $agents array;
 *  - the browser-like request headers were built but never sent;
 *  - the proxy branch tested a variable that was never set (dead code);
 *  - compressed responses are now decoded via CURLOPT_ENCODING.
 *
 * @param string $url    Address to fetch.
 * @param string $ckfile Optional path passed to CURLOPT_COOKIEFILE.
 * @param string $cookie Optional extra request header line; presumably a
 *                       complete "Cookie: ..." header -- TODO confirm with callers.
 *
 * @return string|false Response headers and body, or false when the transfer fails.
 */
function get_data($url, $ckfile="", $cookie="") 
{ 
    // Browser-like request headers sent with every request.
    $headers = array( 
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
        'Accept-Language: en-us,en;q=0.5', 
        'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
        'Keep-Alive: 115', 
        'Connection: keep-alive', 
    ); 

    if ($cookie != "") 
    { 
        $headers[] = $cookie; 
    } 

    // Pool of user agents to choose from; add entries to rotate between them.
    $agents = array( 
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12', 
    ); 

    $ch = curl_init(); 

    curl_setopt($ch, CURLOPT_URL, $url); 
    curl_setopt($ch, CURLOPT_HEADER, true); // keep response headers in the returned string 
    curl_setopt($ch, CURLOPT_NOBODY, false); 

    if (!empty($ckfile)) 
    { 
        curl_setopt($ch, CURLOPT_COOKIEFILE, $ckfile); 
    } 

    // NOTE(review): certificate verification is disabled, as in the original.
    // Enable it when fetching HTTPS pages whose content must be trusted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); 
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); 
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); 
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60); 
    curl_setopt($ch, CURLOPT_TIMEOUT, 90); 
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10); // follow up to 10 redirections - avoids loops 

    // Sends "Accept-Encoding: gzip,deflate" and makes cURL decode the response,
    // so callers receive plain text rather than compressed bytes.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate'); 
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); 
    curl_setopt($ch, CURLOPT_USERAGENT, $agents[array_rand($agents)]); 

    $data = curl_exec($ch); 

    curl_close($ch); 

    return $data; 
} 

我現在得到的輸出是：

**** Got Product List 8 *** Array ([0] => //netdna.bootstrapcdn.com/twitter- 
bootstrap/2.3.2/css/bootstrap-combined.no-icons.min.css [1] => //netdna.bootstrapcdn.com/font- 
awesome/3.2.1/css/font-awesome.css [2] => 
http://www.reebonz.com.sg/sites/all/themes/custom/octopus2/xfavicon.ico.pagespeed.ic.jT8Y7LgYBc.png 
[3] => http://www.octopus2.local/sites/all/themes/custom/octopus2/css/reebonz-ie.css [4] => 
http://www.reebonz.com.sg/sites/all/modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.core.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.theme.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.slider.min.css,qn1a78z+modules,_contrib,_panels,_css,_panels.css,qn1a78z+modules,_custom,_mailcheck,_css,_mailcheck.css,qn1a78z+themes,_custom,_octopus2,_css,_bootstrap.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-core.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-social-network.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-.... 

我的代碼哪裡出錯了？有沒有更簡單的方法可以做到這一點？

+0

沒有看到 `get_data()` 的代碼，無法判斷。 – Daedalus

+0

我已經添加了代碼get_data() – user1583647

回答

0

使用PHP的DomDocument:

// Parse the page markup into a DOM tree, then visit every <img> element in it.
$document = new DOMDocument(); 
$document->loadHTML(your_html_code); // placeholder: substitute the HTML string to parse 
$imageNodes = $document->getElementsByTagName('img'); 
foreach ($imageNodes as $imageNode) { 
    // handle one <img> element here, e.g. read its attributes 
} 
+0

試過了……不行 – user1583647

+0

使用此:http://simplehtmldom.sourceforge.net/ – poostchi

+0

這個也試過了，但還是抓不到圖片 – user1583647

0

下載這個庫:http://sourceforge.net/projects/simplehtmldom/

以下代碼即可正常運作

(請在文件頂部引入該庫)

<?php 
/*
 * Load a page with the Simple HTML DOM library, collect the src of every
 * <img> element that has one, and render the images as an inline list.
 */
error_reporting(1); // level 1 is E_ERROR: report fatal run-time errors only 
include_once('simple_html_dom.php'); 
$html = new simple_html_dom(); 

$html->load_file('https://www.google.co.in/search?q=shahrukh+khan&newwindow=1&biw=1375&bih=791&source=lnms&tbm=isch&sa=X&sqi=2&ved=0ahUKEwi1rO6AjZrKAhWSBY4KHWSGBDQQ_AUIBygC'); 
$reviews = $html->find('img'); 

// Must start as an array: appending with [] to a string is a fatal error on
// PHP 7.1 and later, and the foreach in the template below needs an array
// even when no image was found.
$fetched_images = array(); 
foreach($reviews as $link) 
{ 
    // keep only images that actually carry a src attribute 
    if($link->{'src'} != ''){ 
     $fetched_images[] = $link->{'src'}; 
    } 
} 
?> 
<ul> 
    <?php foreach ($fetched_images as $fetched_image): ?> 
    <?php /* The src comes from a remote page, so escape it before writing it into an attribute. double_encode is off because the value may already contain entities such as &amp;. */ ?>
    <li style="display:inline-block"><img src="<?php echo htmlspecialchars($fetched_image, ENT_QUOTES, 'UTF-8', false);?>"></li> 
    <?php endforeach ?> 

</ul> 
0
<?php 
    // Fetch the target page with the Simple HTML DOM library and echo each
    // <img> element it finds. Echoing an element object relies on the
    // library's own string conversion.
    include_once('simple_html_dom.php'); 

    $pageUrl = "Your URL here"; 

    $dom = new simple_html_dom(); 
    $dom->load_file($pageUrl); 

    // To gather only the src values instead, read $imgTag->{'src'} inside
    // the loop and append the non-empty ones to an array.
    $imgTags = $dom->find('img'); 
    foreach ($imgTags as $imgTag){ 
     echo $imgTag; 
    } 

?> 
+1

能否請您為答案補充一些說明，解釋它做了什麼以及如何運作？ – Gareth