2016-04-01 35 views
2

我試圖在網絡爬蟲類中使用簡單的HTML Dom時出現以下錯誤。該類似乎運行良好,但在error_log文件中出現很多錯誤。錯誤PHP網站爬蟲類使用簡單的HTML Dom

[01-Apr-2016 23:16:51 UTC] PHP Warning: Invalid argument supplied for foreach() in /home/scrybs/public_html/order/uploader/php/simple_html_dom.php on line 357 

如果我檢查簡單的HTML DOM,錯誤來自這裏:

function innertext() 
    { 
     if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 
     if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 

     $ret = ''; 
     foreach ($this->nodes as $n) 
      $ret .= $n->outertext(); 
     return $ret; 
    } 

有問題的爬蟲類是如下:

class crawler 
{ 
    protected $_url; 
    protected $_depth; 
    protected $_host; 
    protected $_useHttpAuth = false; 
    protected $_user; 
    protected $_pass; 
    protected $_seen = array(); 
    protected $_filter = array(); 
    public $contenu = array(); 

    public function __construct($url, $depth = 5) 
    { 
     $this->_url = $url; 
     $this->_depth = $depth; 
     $parse = parse_url($url); 
     $this->_host = $parse['host']; 
     $this->html = new simple_html_dom(); 
    } 

    protected function _processAnchors($content, $url, $depth) 
    { 
     //$dom = new DOMDocument('1.0'); 
     //@$dom->loadHTML($content, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); 
     //$dom->formatOutput = true; 
     $this->html->load($content); 

     $metatitle = $this->html->find('title',0)->innertext; 
     foreach($this->html->find("meta[name='description']") as $element){ 
      $metadescription = $element->content; 
     } 
     foreach($this->html->find("meta[name='keywords']") as $element){ 
      $metakeywords = $element->content; 
     } 

     if(!empty($metatitle)){       
      $this->contenu['meta_titles'][] = $metatitle; 
     } 
     if(!empty($metadescription)){ 
      $this->contenu['meta_titles'][] = $metadescription; 
     } 
     if(!empty($metakeywords)){ 
      $this->contenu['meta_titles'][] = $metakeywords; 
     } 

     // IMAGE ALTS 
     foreach($this->html->find('img') as $e){ 
      if(!empty($e->alt)){ 
       if(!$this->search_array($e->alt, $this->contenu)){ 
        $this->contenu['alt_images'][] = $e->alt; 
       } 
      } 
     } 

     // LINKS 
     $links = $this->html->find('a'); 
     foreach($links as $element){ 
      // GET LINK TEXTS 
      $a = $element->innertext; 
      $a = preg_replace("/<a.*?>(.*?)<\/a>/", '\1', $a); 
      $a = preg_replace("/<p.*?>.*?<\/p>/", "{{P}}", $a); 
      $a = preg_replace("/<img.*?>/", "{{IMG}}", $a); 
      $a = preg_replace('#(<br */?>\s*)+#i', "{{BR}}", $a); 
      $a = preg_replace('#<button.*?>.*?</button>#i', '{{BUTTON}}', $a); 
      $a = preg_replace('#<time.*?>(.*?)</time>#i', '{{TIME}}', $a); 
      $a = preg_replace('#<span.*?>(.*?)</span>#i', '{{SPAN}}\1{{/SPAN}}', $a); 
      $a = preg_replace('#<strong.*?>(.*?)</strong>#i', '{{STRONG}}\1{{/STRONG}}', $a); 
      $a = preg_replace('#<b.*?>(.*?)</b>#i', '{{B}}\1{{/B}}', $a); 
      $a = preg_replace('#<i.*?>(.*?)</i>#i', '{{I}}\1{{/I}}', $a); 
      $a = preg_replace('#<small.*?>(.*?)</small>#i', '{{SMALL}}\1{{/SMALL}}', $a); 
      $a = preg_replace('#<abbr.*?>(.*?)</abbr>#i', '{{ABBR}}\1{{/ABBR}}', $a); 
      $a = trim(strip_tags($a)); 
      $a = preg_replace('/\s+/', ' ', $a); 
       // CHECK IF NOT ONLY VARIABLES AND SPACES 
       $atmp = strip_tags($a); 
       $atmp = preg_replace("/{{.*?}}/", '', $atmp); 
       $atmp = preg_replace('/\s+/', '', $atmp); 
      if(!empty($a) && $a != '' && $atmp != ''){ 
       if(!$this->search_array($a, $this->contenu)){ 
        $this->contenu['link_texts'][] = $a; 
       } 
      } 

      // GET LINK TITLES 
      $title = $element->title; 
      if(!empty($title)){ 
       if(!$this->search_array($title, $this->contenu)){ 
        $this->contenu['link_titles'][] = $title; 
       } 
      } 

      $href = $element->href; 
       if (0 !== strpos($href, 'http')) { 
        $path = '/' . ltrim($href, '/'); 
        if (extension_loaded('http')) { 
         $href = http_build_url($url, array('path' => $path)); 
        } else { 
         $parts = parse_url($url); 
         $href = $parts['scheme'] . '://'; 
         if (isset($parts['user']) && isset($parts['pass'])) { 
          $href .= $parts['user'] . ':' . $parts['pass'] . '@'; 
         } 
         $href .= $parts['host']; 
         if (isset($parts['port'])) { 
          $href .= ':' . $parts['port']; 
         } 
         $href .= $path; 
        } 
       } 
      // Crawl only link that belongs to the start domain 
      $this->crawl_page($href, $depth - 1); 
     } 
     return $this->contenu; 
    } 

    protected function _getContent($url) 
    { 
     $handle = curl_init($url); 
     if ($this->_useHttpAuth) { 
      curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); 
      curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass); 
     } 
     // follows 302 redirect, creates problem wiht authentication 
//  curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE); 
     // return the content 
     curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE); 

     /* Get the HTML or whatever is linked in $url. */ 
     $response = curl_exec($handle); 
     // response total time 
     $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME); 
     /* Check for 404 (file not found). */ 
     $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE); 

     curl_close($handle); 
     return array($response, $httpCode, $time); 
    } 

    protected function _printResult($url, $depth, $httpcode, $time) 
    { 
     ob_end_flush(); 
     $currentDepth = $this->_depth - $depth; 
     $count = count($this->_seen); 
     //echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>"; 
     ob_start(); 
     flush(); 
    } 

    protected function isValid($url, $depth) 
    { 
     if (strpos($url, $this->_host) === false 
      || $depth === 0 
      || isset($this->_seen[$url]) 
      || preg_match("/#/i", $url) 
      || preg_match("/.png/i", $url) 
      || preg_match("/.jpg/i", $url) 
      || preg_match("/.jpeg/i", $url) 
      || preg_match("/.gif/i", $url) 
      || preg_match("/.pdf/i", $url) 
      || preg_match("/javascript/i", $url) 
      || preg_match("/twitter.com/i", $url) 
      || preg_match("/google.com/i", $url) 
      || preg_match("/facebook.com/i", $url) 
      || preg_match("/youtube.com/i", $url) 
      || preg_match("/instagram.com/i", $url) 
      || preg_match("/wp-login.php/i", $url) 
     ) { 
      return false; 
     } 
     foreach ($this->_filter as $excludePath) { 
      if (strpos($url, $excludePath) !== false) { 
       return false; 
      } 
     } 
     return true; 
    } 

    public function search_array($needle, $haystack) { 
     if(in_array($needle, $haystack)) { 
       return true; 
     } 
     foreach($haystack as $element) { 
       if(is_array($element) && $this->search_array($needle, $element)) 
        return true; 
     } 
     return false; 
    } 

    public function crawl_page($url, $depth) 
    { 
     if (!$this->isValid($url, $depth)) { 
      return; 
     } 
     // add to the seen URL 
     $this->_seen[$url] = true; 
     // get Content and Return Code 
     list($content, $httpcode, $time) = $this->_getContent($url); 
     // print Result for current Page 
     //$this->_printResult($url, $depth, $httpcode, $time); 
     // process subPages 
     $this->_processAnchors($content, $url, $depth, $contenu = array()); 
    } 

    public function addFilterPath($path) 
    { 
     $this->_filter[] = $path; 
    } 

    public function run() 
    { 
     $this->crawl_page($this->_url, $this->_depth); 
    } 
} 

錯誤似乎來自何處這條線與無紋文字相關:

// GET LINK TEXTS 
$a = $element->innertext; 

我沒有得到任何錯誤,當我使用:

$a = $element->innertext; 

但並不理想,因爲我想保留HTML標籤。當我在類的外面使用Simple HTML Dom時,我沒有遇到任何錯誤,所以它與Simple HTML Dom在類中的事實有關嗎?有人有想法嗎?

感謝您的幫助!

+1

你能提供處理過的網址嗎?另外,我看到你有DOMDocument行註釋:你也嘗試過使用DOM? – fusion3k

+0

@ fusion3k我嘗試過使用DOMDocument,但是我發現獲得我想要的結果更復雜,因爲它需要一些技巧來獲取HTML標記的內部文本...... – scrybs

回答

2

我發現了這個錯誤。

在我的(有限)測試中,當您設置深度> 1時,問題發生,因此 - 看到您的代碼 - 加載多個頁面URL時。無數的簡單的HTML DOM的問題,是->load()方法不能在多個負載上正常工作。

重新實例化對象html,劇本似乎工作:

protected function _processAnchors($content, $url, $depth) 
{ 
    $this->html = new simple_html_dom();         # <----- 
    $this->html->load($content); 

我也$this->html = str_get_html($content);測試,但它只能在有限的網站。

其他注意事項:在HTML <title>標記是強制​​性的,但並非所有的網站都有格式良好的HTML:考慮檢查<title>標記(以及每個標記)的存在以避免其他錯誤。

+0

像魅力一樣工作!帽子掉了!非常感謝。 – scrybs