1
這是我正在嘗試用 Simple HTML DOM 解析的頁面。我已經完成了 90% 的功能,但由於我剛開始接觸這個函式庫,不太確定該怎麼做。(問題標題:使用 Simple HTML DOM 移除部分元素)
我想抓取每則新聞的文字,但由於這些文字位於 <p>
元素之中,使用 ->innertext 之類的屬性
會把裡面的所有內容都帶出來,包括連結。
這是我目前已經嘗試過的程式碼:
<h1>Scraper Noticias</h1>
<?php
include('simple_html_dom.php');
/**
 * Plain data holder for a single scraped news item.
 *
 * Every field starts out as null and is filled in through its setter;
 * the getters return whatever value was last stored, unchanged.
 */
class News
{
    /** Thumbnail image source, as stored by set_image(). */
    public $image;

    /** Post date, as stored by set_fechanoticia(). */
    public $fechanoticia;

    /** Headline, as stored by set_title(). */
    public $title;

    /** Description text, as stored by set_description(). */
    public $description;

    /** Source URL, as stored by set_sourceurl(). */
    public $sourceurl;

    /** Returns the stored image value. */
    public function get_image()
    {
        return $this->image;
    }

    /** Stores the given image value. */
    public function set_image($new_image)
    {
        $this->image = $new_image;
    }

    /** Returns the stored post date. */
    public function get_fechanoticia()
    {
        return $this->fechanoticia;
    }

    /** Stores the given post date. */
    public function set_fechanoticia($new_fechanoticia)
    {
        $this->fechanoticia = $new_fechanoticia;
    }

    /** Returns the stored title. */
    public function get_title()
    {
        return $this->title;
    }

    /** Stores the given title. */
    public function set_title($new_title)
    {
        $this->title = $new_title;
    }

    /** Returns the stored description. */
    public function get_description()
    {
        return $this->description;
    }

    /** Stores the given description. */
    public function set_description($new_description)
    {
        $this->description = $new_description;
    }

    /** Returns the stored source URL. */
    public function get_sourceurl()
    {
        return $this->sourceurl;
    }

    /** Stores the given source URL. */
    public function set_sourceurl($new_sourceurl)
    {
        $this->sourceurl = $new_sourceurl;
    }
}
// Create DOM from URL or file.
$html = file_get_html('http://www.uvm.cl/noticias_mas.shtml');
if (!$html) {
    // Without this guard a failed load ends in a fatal "member function on bool"
    // error at the first find() call; stop with a readable message instead.
    exit('Unable to load the news page.');
}

// Collects one News object per <p> found inside #cont2.
$parsedNews = array();

// Find all news items.
foreach ($html->find('#cont2 p') as $element) {
    $newItem = new News;

    // Parse the news item's thumbnail image (the last <img> wins if there are several).
    foreach ($element->find('img') as $image) {
        $newItem->set_image($image->src);
    }

    // Parse the news item's post date.
    foreach ($element->find('span.fechanoticia') as $fecha) {
        $newItem->set_fechanoticia($fecha->innertext);
    }

    // Parse the news item's title and source URL from the same anchor
    // (the last <a> wins if there are several).
    foreach ($element->find('a') as $anchor) {
        $newItem->set_title($anchor->innertext);
        $newItem->set_sourceurl("http://www.uvm.cl/" . $anchor->href);
    }

    // Parse the news item's description text: start from the paragraph's inner
    // HTML, cut out the markup already captured in other fields (the link and
    // the date span), then drop any remaining tags so only plain text is left.
    $descriptionHtml = $element->innertext;
    foreach (array('a', 'span.fechanoticia') as $selector) {
        foreach ($element->find($selector) as $nested) {
            $descriptionHtml = str_replace($nested->outertext, '', $descriptionHtml);
        }
    }
    $newItem->set_description(trim(strip_tags($descriptionHtml)));
    echo $newItem->get_description() . "<br />";

    $parsedNews[] = $newItem;
}
?>
我剛測試了一下,它返回了 7 個連結。你是只想要文字,並把連結去掉嗎? – 2012-07-26 22:50:33
@保羅:沒錯。 :) 這正是我遇到的問題。我想要的是文字,不包含連結。 – 2012-07-26 22:51:34
見下文...... – 2012-07-26 22:59:50