2014-04-10 186 views
1

我正在使用DOMDocument()加載html頁面的文本。DOMDocument()花費太多時間來加載頁面

加載頁面需要很長時間。這是否意味着它也下載圖像?

任何替代方案或解決方案更快速地從url加載html頁面?

我使用的DOMDocument()以基本上提取元描述,標題文本,正文等

如能提供附帶可運行代碼的解決方案,將不勝感激。

<?php
// Link crawler entry point.
//
// Loads the HTML page named by the "url" request parameter into $doc so the
// rest of the script can extract and check its links.
set_time_limit(0);
// NOTE(review): connection.php presumably creates the mysqli handle $con used by the queries below - confirm.
include "connection.php";

error_reporting(E_ERROR | E_PARSE);

// Create a document instance
$doc = new DOMDocument();

// Without a ?url= parameter the URL is taken from $urlfromdaemon (expected to be
// set by whatever script includes this file) and $silentcrawl is defined; later
// code checks isset($silentcrawl) to suppress the per-link HTML output.
if (!isset($_GET['url'])) {
    $_GET['url'] = $urlfromdaemon;
    $silentcrawl = "set";
}

// The URL is untrusted input: anything that is not a plain string is treated as "no URL".
if (!is_string($_GET['url'])) {
    $_GET['url'] = '';
}

$crawlTarget = parse_url($_GET['url']);
$crawlScheme = (is_array($crawlTarget) && isset($crawlTarget['scheme'])) ? strtolower($crawlTarget['scheme']) : '';
$crawlHost = (is_array($crawlTarget) && isset($crawlTarget['host'])) ? $crawlTarget['host'] : '';

// loadHTMLFile() accepts local paths and every registered stream wrapper
// (file://, php://, ...), so only fetch real http/https URLs. For anything else
// $doc stays empty and no links are extracted from it.
if (($crawlScheme === 'http' || $crawlScheme === 'https') && $crawlHost !== '') {
    $doc->loadHTMLFile($_GET['url']);
}

// Site root of the crawled page (always built with the http scheme, as before).
$base_url = 'http://' . $crawlHost . '/';

    // Walk every <a> element of the loaded document. The href goes into
    // $urlarray with spaces percent-encoded (so URLs containing spaces still
    // work) and the link text goes into $urlanchor at the same index.
    $anchorNodes = $doc->getElementsByTagName("a");
    $urlarray = array();
    for ($k = 0; $k < $anchorNodes->length; $k++) {
        $anchorNode = $anchorNodes->item($k);
        $urlarray[$k] = str_replace(" ", "%20", $anchorNode->getAttribute('href'));
        $urlanchor[$k] = $anchorNode->nodeValue;
    }
// 

// Turn every collected href into an absolute URL, in place in $urlarray.
// Links already starting with "http://" or "https://" are left untouched;
// everything else is classified by its prefix and rewritten below.
for($i=0;$i<count($urlarray);$i++){ 
// $result is non-empty only when the first 7 characters are "http://" (case-insensitive).
$result=stristr(substr($urlarray[$i], 0, 7),"http://"); 
if($result==''){ 

// "https://" links are already absolute: nothing to do.
if(stristr(substr($urlarray[$i], 0, 8),"https://")!=''){ 

} 

// Protocol-relative link ("//host/path"): prefix the http scheme.
else if(stristr(substr($urlarray[$i], 0, 2),"//")!=''){ 
$urlarray[$i]= 'http:'.$urlarray[$i]; 
} 

// No scheme and no "www." prefix: the link is either a bare host name
// ("example.com/page") or a path relative to the crawled page.
else if(stristr(substr($urlarray[$i], 0, 4),"www.")==''){ 
// Heuristic: request "http://" + link and see whether anything answers.
// NOTE(review): this issues one network request per such link, which is a likely
// contributor to long run times on pages with many relative links.

$urlcheck='http://'.$urlarray[$i]; 
$headers = @get_headers($urlcheck, 1); 
if ($headers === FALSE) { // no response: treat the link as a relative path, not as a host name

// Crawled URL already ends in "/": append the link directly.
if(substr($_GET['url'],-1)=='/'){ 
$urlarray[$i]= $_GET['url'].$urlarray[$i]; 
} 
else{ 
// Path component is exactly "/" although the URL does not end in "/"
// (so something such as a query string follows the slash).
// NOTE(review): basename("/") is an empty string, so $trim appears to strip nothing here - confirm intent.
if(parse_url($_GET['url'], PHP_URL_PATH)=='/'){ 
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH)); 
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i]; 

} 
// Removing the last segment of the crawled URL leaves a string ending in "/".
else if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-1)=='/'){ 

// What is left is just the scheme ("http://"), i.e. the crawled URL has no path:
// join with an explicit "/" between the URL and the link.
if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-7)=='http://'){ 
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH)); 
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i]; 
} 
// Otherwise drop the last path segment of the crawled URL and append the link
// to the remaining directory part.
// NOTE(review): str_ireplace removes every case-insensitive occurrence of $trim
// in the URL, not only the trailing segment.
else{ 
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH)); 
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i]; 
} 

} 

// Fallback: strip the last path segment and join with an explicit "/".
else{ 
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH)); 
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i]; 
} 
} 

} 
// Something answered at "http://" + link: treat the link as an absolute host name.
else { 

$urlarray[$i]= 'http://'.$urlarray[$i]; 

} 



// 
} 



// Link starts with "www.": only the scheme is missing.
else{ 
$urlarray[$i]='http://'.$urlarray[$i]; 

} 

} 

} 

// Probe every collected link and record the ones that do not answer with 404.
//
// For each entry of $urlarray:
//   - request its headers; a 404 status line marks the link as dead (cross icon),
//     anything else - including "no response" - is handled as a live link (tick icon);
//   - output is skipped entirely when $silentcrawl is set;
//   - a live link that is not yet in `links` is inserted with the crawled page as
//     referer and the link text as anchor_pool;
//   - a live link that already exists gets the crawled page appended to its
//     space-separated referer list (unless it is already listed or the link points
//     to the crawled page itself) and the link text merged into anchor_pool.
//
// URLs and anchor text come from the crawled page, so they are bound as statement
// parameters and HTML-escaped before being echoed.
$crawledPageUrl = (string) $_GET['url'];
$linkTotal = count($urlarray);
for ($i = 0; $i < $linkTotal; $i++) {
    $linkUrl = (string) $urlarray[$i];
    $linkAnchor = isset($urlanchor[$i]) ? (string) $urlanchor[$i] : '';
    $linkUrlHtml = htmlspecialchars($linkUrl, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    // get_headers() emits a warning and returns false when the request fails.
    $linkHeaders = @get_headers($linkUrl);
    $linkStatusLine = (is_array($linkHeaders) && isset($linkHeaders[0])) ? $linkHeaders[0] : '';

    // Match the status code itself so HTTP/1.0, HTTP/2 and non-standard reason
    // phrases are recognised as 404 too.
    if (preg_match('#^HTTP/\S+\s+404\b#i', $linkStatusLine) === 1) {
        if (!isset($silentcrawl)) {
            echo '<img style="width:20px;height:20px;float:left;" src="cross.png" > ' . $linkUrlHtml . '<br><Br>';
        }
        continue;
    }

    if (!isset($silentcrawl)) {
        echo '<img style="width:20px;height:20px;float:left;" src="tick.png" > ' . $linkUrlHtml . '<br><br>';
    }

    // Read the existing rows for this URL completely before issuing any write.
    $linkLookup = mysqli_prepare($con, "SELECT referer, anchor_pool FROM links WHERE url = ?");
    if ($linkLookup === false) {
        continue;
    }
    $existingLinkRows = array();
    mysqli_stmt_bind_param($linkLookup, 's', $linkUrl);
    mysqli_stmt_execute($linkLookup);
    mysqli_stmt_store_result($linkLookup);
    mysqli_stmt_bind_result($linkLookup, $storedReferer, $storedAnchorPool);
    while (mysqli_stmt_fetch($linkLookup)) {
        $existingLinkRows[] = array(
            'referer' => (string) $storedReferer,
            'anchor_pool' => (string) $storedAnchorPool,
        );
    }
    mysqli_stmt_close($linkLookup);

    // Insert active links that are not in the database yet.
    if (count($existingLinkRows) === 0) {
        $linkInsert = mysqli_prepare($con, "INSERT INTO links (url,referer,anchor_pool) VALUES (?,?,?)");
        if ($linkInsert !== false) {
            mysqli_stmt_bind_param($linkInsert, 'sss', $linkUrl, $crawledPageUrl, $linkAnchor);
            mysqli_stmt_execute($linkInsert);
            mysqli_stmt_close($linkInsert);
        }
        continue;
    }

    foreach ($existingLinkRows as $existingLinkRow) {
        $knownReferers = explode(" ", $existingLinkRow['referer']);

        // Skip when this page is already recorded as a referer or the link is a self-link.
        if (in_array($crawledPageUrl, $knownReferers, true) || $crawledPageUrl === $linkUrl) {
            continue;
        }

        $newreferer = $existingLinkRow['referer'] . " " . $crawledPageUrl;

        // anchor_pool is a space-separated bag of distinct words.
        $anchorWords = explode(" ", $existingLinkRow['anchor_pool'] . ' ' . $linkAnchor);
        $final_anchor = implode(' ', array_values(array_unique($anchorWords)));

        $linkUpdate = mysqli_prepare($con, "UPDATE links SET referer = ?, anchor_pool = ? WHERE url = ?");
        if ($linkUpdate !== false) {
            mysqli_stmt_bind_param($linkUpdate, 'sss', $newreferer, $final_anchor, $linkUrl);
            mysqli_stmt_execute($linkUpdate);
            mysqli_stmt_close($linkUpdate);
        }
    }
}

// Tell the user when not a single non-empty link was collected from the page.
$nonEmptyLinks = array_filter($urlarray);
if (count($nonEmptyLinks) === 0) {
    echo "Either the URL is down or page contains no Links !, Try entering URL along with protocol used.";
}


// Make sure the crawled page itself has a row in `links`.
//
// A page that is not known yet is inserted with empty referer/anchor_pool,
// backlinks '0' and status '1'. For a page that already exists, status is set
// to '1' only when the page currently answers with HTTP status 200.
// The URL is request input, so it is always bound as a statement parameter.
$prime = (string) $_GET['url'];

$primeLookup = mysqli_prepare($con, "SELECT url FROM links WHERE url = ?");
if ($primeLookup !== false) {
    mysqli_stmt_bind_param($primeLookup, 's', $prime);
    mysqli_stmt_execute($primeLookup);
    mysqli_stmt_store_result($primeLookup);
    $count = mysqli_stmt_num_rows($primeLookup);
    mysqli_stmt_close($primeLookup);

    if ($count == 0) {
        $primeInsert = mysqli_prepare(
            $con,
            "INSERT INTO links (url,referer,anchor_pool,backlinks,status) VALUES (?,'','','0','1')"
        );
        if ($primeInsert !== false) {
            mysqli_stmt_bind_param($primeInsert, 's', $prime);
            mysqli_stmt_execute($primeInsert);
            mysqli_stmt_close($primeInsert);
        }
    } else {
        // get_headers() returns false when the request fails; match the status
        // code rather than one exact status line so "HTTP/1.0 200 OK" etc. count too.
        $primeHeaders = @get_headers($prime);
        $primeStatusLine = (is_array($primeHeaders) && isset($primeHeaders[0])) ? $primeHeaders[0] : '';
        if (preg_match('#^HTTP/\S+\s+200\b#i', $primeStatusLine) === 1) {
            $primeUpdate = mysqli_prepare($con, "UPDATE links SET status = '1' WHERE url = ?");
            if ($primeUpdate !== false) {
                mysqli_stmt_bind_param($primeUpdate, 's', $prime);
                mysqli_stmt_execute($primeUpdate);
                mysqli_stmt_close($primeUpdate);
            }
        }
    }
}


// Recompute `backlinks` for every row of `links` as the number of distinct
// hosts found in the row's space-separated referer list.
//
// Each referer is reduced to its host with a foreach, so empty tokens in the
// list (e.g. from a leading space) cannot shift indexes and leave full URLs
// mixed in with host names. Host names are compared case-insensitively.
// Referers without a parseable host are ignored.
$res = mysqli_query($con, "SELECT * from links ");
$backlinkUpdate = mysqli_prepare($con, "UPDATE links SET backlinks = ? WHERE url = ?");

if ($res !== false && $backlinkUpdate !== false) {
    while ($row = mysqli_fetch_array($res)) {
        $refererHosts = array();
        foreach (explode(" ", (string) $row['referer']) as $refererUrl) {
            if ($refererUrl === '') {
                continue;
            }
            $refererHost = parse_url($refererUrl, PHP_URL_HOST);
            if (is_string($refererHost) && $refererHost !== '') {
                // Array keys act as a set of distinct hosts.
                $refererHosts[strtolower($refererHost)] = true;
            }
        }

        $bk = count($refererHosts);
        $backlinkRowUrl = (string) $row['url'];

        // The row URL originates from crawled pages, so bind it instead of interpolating.
        mysqli_stmt_bind_param($backlinkUpdate, 'is', $bk, $backlinkRowUrl);
        mysqli_stmt_execute($backlinkUpdate);
    }
}

if ($backlinkUpdate !== false) {
    mysqli_stmt_close($backlinkUpdate);
}
if ($res !== false) {
    mysqli_free_result($res);
}

?> 
+0

工作代碼?你甚至沒有顯示你的非工作的! –

+0

DomDocument只加載頁面的標記。如果頁面加載速度慢,可能有幾個原因。連通性差,服務器速度慢等.DomDocument在我眼中運行速度非常快。 – Marcel

+0

我已添加代碼 – user3475546

回答

0

DOMDocument類是一個HTML/XML解析器,僅此而已。

您沒有共享的代碼可能會使用PHP流包裝來透明地使用與加載本地文件相同的語法通過HTTP下載遠程資源。這是完全不同的任務。據我所知,PHP不會將一個功能齊全的網絡爬蟲作爲其內置庫的一部分進行捆綁。

編輯:完整的下載就發生在這一行:

$doc->loadHTMLFile($_GET['url']); 

該行之後的一切都不是由下載問題引起的。

相關問題