我建議不要使用 DOM 來解析 HTML,因爲它在遇到無效(格式不正確)的 HTML 時容易出問題;可以改用正則表達式。
我使用這個類:
<?php
/**
* Class to return HTML elements from a HTML document
* @version 0.3.1
*/
class HTMLQuery
{
    /**
     * Lower-case names of void elements: they have no closing tag and no content.
     */
    protected $selfClosingTags = array('area', 'base', 'br', 'hr', 'img', 'input', 'link', 'meta', 'param');

    /**
     * The markup currently being queried.
     */
    private $html = '';

    /**
     * @param string|false $html Optional markup to load immediately; false loads nothing.
     */
    public function __construct($html = false)
    {
        if ($html !== false)
        {
            $this->load($html);
        }
    }

    /**
     * Load a HTML string, replacing any previously loaded markup.
     *
     * @param string $html
     */
    public function load($html)
    {
        $this->html = (string) $html;
    }

    /**
     * Returns elements from the HTML.
     *
     * Each returned element is an array with the key 'name' (the $element argument as
     * given), plus 'attributes' (lower-cased name => value) when at least one name=value
     * attribute was found, plus 'content' (raw inner markup) when it is not empty.
     *
     * Matching is regex based: an element ends at the first closing tag of the same
     * name, so nested elements of the same name are not paired up correctly, and a
     * literal '>' inside an attribute value ends the opening tag early.
     *
     * @param string       $element         Tag name, matched case-insensitively.
     * @param string|false $attribute_match Keep only elements carrying this attribute
     *                                      (case-insensitive); false, null or '' disables the filter.
     * @param string|false $value_match     When given together with $attribute_match, that
     *                                      attribute must have exactly this value; false, null
     *                                      or '' disables the value check.
     * @return array List of matching elements; empty when nothing matches.
     */
    public function getElements($element, $attribute_match = false, $value_match = false)
    {
        $elements = array();

        # Escape the tag name so it can never inject regex syntax
        $tag = preg_quote((string) $element, '/');
        if ($tag === '')
        {
            return $elements;
        }

        # The lookahead makes sure the tag name ends here, so 'a' does not match <abbr>
        $opening = '<' . $tag . '(?=[\s\/>])([^>]*)>';
        if (in_array(strtolower((string) $element), $this->selfClosingTags, true))
        {
            # Void elements: the trailing slash is optional (<br> and <br /> both match)
            $pattern = '/' . $opening . '/is';
        }
        else
        {
            $pattern = '/' . $opening . '(.*?)<\/' . $tag . '\s*>/is';
        }

        # preg_match_all() yields 0 for "no match" and false on error; both mean no elements
        if (!preg_match_all($pattern, $this->html, $matches))
        {
            return $elements;
        }

        $filter_by_attribute = ($attribute_match !== false && $attribute_match !== null && $attribute_match !== '');
        $filter_by_value = ($value_match !== false && $value_match !== null && $value_match !== '');
        # parseAttributes() lower-cases attribute names, so normalise the wanted name too
        $wanted_attribute = $filter_by_attribute ? mb_strtolower((string) $attribute_match) : '';

        foreach ($matches[1] as $key => $raw_attributes)
        {
            $current_el = array('name' => $element);

            $attributes = $this->parseAttributes($raw_attributes);
            if ($attributes)
            {
                $current_el['attributes'] = $attributes;
            }

            # Group 2 only exists for non-void elements; compare against '' so "0" is kept
            if (isset($matches[2][$key]) && $matches[2][$key] !== '')
            {
                $current_el['content'] = $matches[2][$key];
            }

            if ($filter_by_attribute)
            {
                # The value check applies to the requested attribute only, not to any attribute
                if (!isset($attributes[$wanted_attribute]))
                {
                    continue;
                }
                if ($filter_by_value && (string) $attributes[$wanted_attribute] !== (string) $value_match)
                {
                    continue;
                }
            }

            $elements[] = $current_el;
        }

        return $elements;
    }

    /**
     * Return an associative array of all the form inputs.
     *
     * Collects <input> and <button> elements (using their value attribute) and
     * <textarea> elements (using their content). Elements without a name are skipped,
     * as are inputs/buttons without a value attribute and textareas without content.
     * Keys are the lower-cased name attributes; a later element overwrites an earlier
     * one with the same key.
     *
     * @return array Map of lower-cased field name => value; empty when none are found.
     */
    public function getFormValues()
    {
        $form_values = array();

        $elements = array_merge(
            $this->getElements('input'),
            $this->getElements('textarea'),
            $this->getElements('button')
        );

        foreach ($elements as $current_el)
        {
            # Check for the name before reading it; unnamed fields carry no key
            if (!isset($current_el['attributes']['name']))
            {
                continue;
            }
            $attribute_name = mb_strtolower($current_el['attributes']['name']);

            if (in_array($current_el['name'], array('input', 'button'), true))
            {
                if (isset($current_el['attributes']['value']))
                {
                    $form_values[$attribute_name] = $current_el['attributes']['value'];
                }
            }
            elseif (isset($current_el['content']))
            {
                $form_values[$attribute_name] = $current_el['content'];
            }
        }

        return $form_values;
    }

    /**
     * Parses attributes into an array.
     *
     * Only name=value pairs are recognised; valueless attributes such as "disabled"
     * are ignored. A value may be wrapped in double quotes, single quotes or corner
     * brackets, or be unquoted, in which case it ends at the first whitespace. A quoted
     * value without its closing quote runs to the end of the string.
     * NOTE(review): the corner brackets are kept from the previous pattern; confirm
     * that they are really meant to be accepted as quote characters.
     *
     * @param string $str Raw text between the tag name and the closing '>'.
     * @return array Map of lower-cased attribute name => value; empty when none are found.
     */
    private function parseAttributes($str)
    {
        $attributes = array();

        $str = trim(rtrim(trim((string) $str), '/'));
        if ($str === '')
        {
            return $attributes;
        }

        # Branch reset (?|...) keeps the value in group 2 whichever quoting style matched
        $name = '([^\s=]+)\s*=\s*';
        $utf8_pattern = '/' . $name . '(?|"([^"]*)(?:"|\z)|\'([^\']*)(?:\'|\z)|「([^」]*)(?:」|\z)|([^\s"\'「」]*))/u';
        $byte_pattern = '/' . $name . '(?|"([^"]*)(?:"|\z)|\'([^\']*)(?:\'|\z)|([^\s"\']*))/';

        # The /u pattern treats the corner brackets as characters instead of raw bytes;
        # it fails on input that is not valid UTF-8, so retry with ASCII quotes only
        if (preg_match_all($utf8_pattern, $str, $matches) === false)
        {
            preg_match_all($byte_pattern, $str, $matches);
        }

        if (empty($matches[1]))
        {
            return $attributes;
        }

        foreach ($matches[1] as $key => $att)
        {
            $attributes[mb_strtolower($att)] = $matches[2][$key];
        }

        return $attributes;
    }
}
?>
用法爲:
// The markup must be handed to the object (constructor or load()) before querying it.
$html = '<table><tr class="row"><td>1</td></tr></table>';
$c = new HTMLQuery($html);
// Fetch every <tr> element as an array with its name, attributes and content.
$x = $c->getElements('tr');
print_r($x);