2013-12-22 68 views
-1

我有以下代碼:
獲取內部HTML - PHP

// Download the competition listing page.
$data = file_get_contents('http://www.robotevents.com/robot-competitions/vex-robotics-competition?limit=all');
if ($data === false) {
    // file_get_contents() returns false on failure; stop instead of parsing nothing.
    exit("Download failed\n");
}
echo "Downloaded";

$dom = new DOMDocument;
// preserveWhiteSpace only influences parsing, so it has to be set before loading.
$dom->preserveWhiteSpace = false;

// Collect libxml's complaints about sloppy markup instead of silencing every
// error of the statement with "@".
$previousSetting = libxml_use_internal_errors(true);
// Without a charset hint loadHTML() reads the bytes as ISO-8859-1, which turns
// multi-byte UTF-8 sequences (e.g. a non-breaking space) into stray characters
// such as "Â". NOTE(review): assumes the page is served as UTF-8 - confirm.
$dom->loadHTML('<?xml encoding="UTF-8">' . $data);
libxml_clear_errors();
libxml_use_internal_errors($previousSetting);

$tables = $dom->getElementsByTagName('table');

// item() returns null when the page has fewer than three tables.
$table = $tables->item(2);
if ($table === null) {
    exit("Expected table not found\n");
}

// URLs of the link found in the last cell of each row.
$urls = array();

foreach ($table->getElementsByTagName('tr') as $row) {
    $cols = $row->getElementsByTagName('td');
    foreach ($cols as $col) {
        echo $col->nodeValue . "\n";
    }

    // Header rows contain no <td>, so guard before looking at the last cell.
    if ($cols->length > 0) {
        $anchors = $cols->item($cols->length - 1)->getElementsByTagName('a');
        if ($anchors->length > 0) {
            $urls[] = $anchors->item(0)->getAttribute('href');
        }
    }
}

最後一個字段包含一個鏈接,我需要存儲該鏈接的URL。另外,該腳本還會輸出「Â」之類的亂碼字符。有誰知道如何解決這兩個問題?

回答

-1

我建議不要使用DOM來解析HTML,因爲它在處理無效的HTML時會出問題。可以改用正則表達式。

我使用這個類:

<?php 

    /**
     * Lightweight, regex-based extractor for elements of an HTML string.
     *
     * This is not an HTML parser: elements are located with regular
     * expressions, so nested elements of the same name are not balanced (an
     * opening tag is paired with the first closing tag that follows it) and
     * comments or script bodies are not treated specially.
     *
     * Matching is done byte-wise (no "u" modifier). All delimiters involved
     * are ASCII, so UTF-8 text passes through untouched and input that is not
     * valid UTF-8 does not make the expressions fail.
     *
     * @version 0.3.1
     */
    class HTMLQuery
    {
        /**
         * Elements that have no closing tag; matched as "<tag ...>" or "<tag ... />".
         */
        protected $selfClosingTags = array(
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
            'link', 'meta', 'param', 'source', 'track', 'wbr',
        );

        /**
         * The HTML being searched; empty until load() is called.
         */
        private $html = '';

        /**
         * @param string|false $html Optional HTML to load immediately.
         */
        public function __construct($html = false)
        {
            if ($html !== false) {
                $this->load($html);
            }
        }

        /**
         * Load a HTML string
         *
         * @param string $html Markup that later getElements() calls search.
         */
        public function load($html)
        {
            $this->html = (string) $html;
        }

        /**
         * Returns elements from the HTML
         *
         * Each returned element is an array with the key 'name' (the requested
         * element name), plus 'attributes' (lower-cased name => value) when the
         * tag has any and 'content' (raw inner HTML) when it is not empty.
         *
         * @param string       $element         Tag name, matched case-insensitively.
         * @param string|false $attribute_match Keep only elements carrying this
         *                                      attribute; any falsy value disables
         *                                      filtering.
         * @param string|false $value_match     Additionally require that attribute
         *                                      to have exactly this value; false,
         *                                      null or '' means "any value".
         * @return array List of matching elements, empty when nothing matches.
         */
        public function getElements($element, $attribute_match = false, $value_match = false)
        {
            $element = (string) $element;
            // The name is spliced into a pattern, so escape regex metacharacters.
            $tag = preg_quote($element, '/');

            // Attribute section of a tag. Quoted values are consumed as a unit so
            // a ">" inside them does not end the tag; the alternatives cannot
            // overlap, which makes the possessive quantifier safe.
            $attrs = '((?:"[^"]*"|\'[^\']*\'|[^\'">])*+)';

            // The look-ahead stops "a" from matching "<article>" or "tr" from
            // matching "<track>".
            $open = '/<' . $tag . '(?=[\s\/>])' . $attrs . '>';

            if (in_array(strtolower($element), $this->selfClosingTags, true)) {
                // Void elements: the trailing "/" is optional in HTML and, when
                // present, ends up in the attribute capture where
                // parseAttributes() strips it.
                $pattern = $open . '/is';
            } else {
                $pattern = $open . '(.*?)<\/' . $tag . '\s*>/is';
            }

            // 0 matches and a PCRE failure (false) both mean "nothing found".
            if (!preg_match_all($pattern, $this->html, $matches, PREG_SET_ORDER)) {
                return array();
            }

            # Create an array of matched elements with attributes and content
            $elements = array();
            foreach ($matches as $match) {
                $current_el = array('name' => $element);

                $attributes = $this->parseAttributes($match[1]);
                if ($attributes) {
                    $current_el['attributes'] = $attributes;
                }

                // Group 2 only exists for paired tags. Compare against '' rather
                // than testing truthiness so that content "0" is kept.
                if (isset($match[2]) && $match[2] !== '') {
                    $current_el['content'] = $match[2];
                }

                $elements[] = $current_el;
            }

            # Return only elements with a specific attribute and or value if specified
            if (!$attribute_match) {
                return $elements;
            }

            // Attribute names are stored lower-cased, so normalise the filter too.
            $wanted = mb_strtolower((string) $attribute_match);
            $filter_value = $value_match !== false && $value_match !== null && $value_match !== '';

            $filtered = array();
            foreach ($elements as $current_el) {
                if (!isset($current_el['attributes'][$wanted])) {
                    continue;
                }
                // The value must belong to the requested attribute, not to any
                // attribute of the element.
                if ($filter_value && $current_el['attributes'][$wanted] !== (string) $value_match) {
                    continue;
                }
                $filtered[] = $current_el;
            }

            return $filtered;
        }

        /**
         * Return an associative array of all the form inputs
         *
         * Covers <input>, <textarea> and <button>. Keys are the lower-cased
         * "name" attributes; values come from the "value" attribute for inputs
         * and buttons and from the element content for textareas. Elements
         * without a name (or without a value/content) are skipped. When a name
         * occurs more than once the later element wins.
         *
         * @return array Field name => value, empty when no field qualifies.
         */
        public function getFormValues()
        {
            $form_values = array();

            $elements = array_merge(
                $this->getElements('input'),
                $this->getElements('textarea'),
                $this->getElements('button')
            );

            foreach ($elements as $current_el) {
                // Check before reading: many elements have no name attribute.
                if (!isset($current_el['attributes']['name'])) {
                    continue;
                }
                $attribute_name = mb_strtolower($current_el['attributes']['name']);

                if (in_array($current_el['name'], array('input', 'button'), true)) {
                    if (isset($current_el['attributes']['value'])) {
                        $form_values[$attribute_name] = $current_el['attributes']['value'];
                    }
                } elseif (isset($current_el['content'])) {
                    $form_values[$attribute_name] = $current_el['content'];
                }
            }

            return $form_values;
        }

        /**
         * Parses attributes into an array
         *
         * Understands name="value", name='value' and unquoted name=value.
         * Attributes written without "=" are ignored.
         *
         * @param string $str Raw text between the tag name and the closing ">".
         * @return array Lower-cased attribute name => raw (undecoded) value.
         */
        private function parseAttributes($str)
        {
            $attributes = array();

            // Drop surrounding whitespace and the "/" of a self-closing tag.
            $str = trim(rtrim(trim($str), '/'));
            if ($str === '') {
                return $attributes;
            }

            // One alternative per quoting style, so a value may contain the other
            // quote character and an unquoted value ends at the next whitespace.
            $pattern = '/([^\s=\/"\'<>]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/';
            if (!preg_match_all($pattern, $str, $matches, PREG_SET_ORDER)) {
                return $attributes;
            }

            foreach ($matches as $match) {
                // With PREG_SET_ORDER trailing groups that did not take part are
                // omitted, so the highest index present holds the value.
                if (isset($match[4])) {
                    $value = $match[4];
                } elseif (isset($match[3])) {
                    $value = $match[3];
                } else {
                    $value = $match[2];
                }

                $attributes[mb_strtolower($match[1])] = $value;
            }

            return $attributes;
        }

    }

?> 

用法爲:

// Pass the HTML to search; without it there is nothing to match against.
// $data is the page source fetched with file_get_contents() in the question.
$c = new HTMLQuery($data); 
$x = $c->getElements('tr'); 
print_r($x);