1
我正在使用 DOMDocument() 加載 HTML 頁面的文本,但 DOMDocument() 加載頁面花費的時間太長。
這是否意味着它也會下載圖像?
任何替代方案或解決方案更快速地從url加載html頁面?
我使用 DOMDocument() 主要是為了提取 meta 描述、標題文本、正文等內容。
如果有可運行的代碼解決方案,將不勝感激。
<?php
/*
 * Link crawler, step 1: environment setup and page download.
 *
 * Reads the page address from $_GET['url'] (or, when that is absent, from
 * $urlfromdaemon), loads its markup into $doc and derives $base_url
 * ("http://<host>/") from it.
 */
set_time_limit(0);
include "connection.php";
error_reporting(E_ERROR | E_PARSE);

// Bound how long a single HTTP fetch may block. The script performs many
// sequential requests, so one unresponsive host would otherwise stall the
// whole crawl for the full default socket timeout.
if ((int) ini_get('default_socket_timeout') > 15) {
    ini_set('default_socket_timeout', '15');
}

// Create a document instance
$doc = new DOMDocument();

// No URL in the query string: fall back to $urlfromdaemon and suppress the
// HTML report. NOTE(review): $urlfromdaemon is not defined in this file;
// presumably it is supplied by the script that includes this one - confirm.
if (!isset($_GET['url'])) {
    $_GET['url'] = $urlfromdaemon;
    $silentcrawl = "set";
}

// The rest of the script treats the URL as a string (concatenation, SQL
// values), so array-valued or null input is reduced to an empty string.
if (!is_string($_GET['url'])) {
    $_GET['url'] = '';
}

$pageParts  = parse_url($_GET['url']);
$pageScheme = (is_array($pageParts) && isset($pageParts['scheme'])) ? strtolower($pageParts['scheme']) : '';
$pageHost   = (is_array($pageParts) && isset($pageParts['host'])) ? $pageParts['host'] : '';

// Only fetch real web addresses. loadHTMLFile() accepts any path or stream
// wrapper, so passing the request parameter through unchecked would let a
// caller make the server read local files. When the URL is rejected $doc
// stays empty and the script later reports that no links were found.
if (($pageScheme === 'http' || $pageScheme === 'https') && $pageHost !== '') {
    $doc->loadHTMLFile($_GET['url']);
}

// Site root of the crawled page; empty host yields "http:///".
$base_url = 'http://' . $pageHost . '/';
// Walk every <a> element of the loaded document once, storing its href in
// $urlarray and its text content in $urlanchor under the same index.
$tit = $doc->getElementsByTagName("a");
$urlarray = array();
foreach ($tit as $anchorNode) {
    $slot = count($urlarray);
    // Spaces are rewritten to %20 so that URLs containing them keep working.
    $urlarray[$slot] = str_ireplace(" ", "%20", $anchorNode->getAttribute('href'));
    $urlanchor[$slot] = $anchorNode->nodeValue;
}
// Rewrite every collected href in $urlarray, in place, into an absolute URL.
// Each href is classified by its prefix (case-insensitively, via stristr):
//   "http://"  -> left untouched
//   "https://" -> left untouched
//   "//"       -> prefixed with "http:"
//   "www."     -> prefixed with "http://"
//   anything else -> probed over HTTP to decide between "host name" and
//                    "path relative to the crawled page" (see below).
for($i=0;$i<count($urlarray);$i++){
$result=stristr(substr($urlarray[$i], 0, 7),"http://");
// stristr() gives false when there is no match and false == '' holds, so
// this branch handles every href that does not start with "http://".
if($result==''){
if(stristr(substr($urlarray[$i], 0, 8),"https://")!=''){
// Already an absolute https URL: nothing to change.
}
else if(stristr(substr($urlarray[$i], 0, 2),"//")!=''){
// Protocol-relative href: give it an explicit http scheme.
$urlarray[$i]= 'http:'.$urlarray[$i];
}
else if(stristr(substr($urlarray[$i], 0, 4),"www.")==''){
// Critical section: the href has no recognised prefix. Try it as a host
// name by requesting "http://<href>".
// NOTE(review): this issues one HTTP request per such href, which is
// likely a major contributor to the slow run time - confirm by profiling.
$urlcheck='http://'.$urlarray[$i];
$headers = @get_headers($urlcheck, 1);
if ($headers === FALSE) { // Request failed: treat the href as a path relative to the crawled page rather than as a host name.
if(substr($_GET['url'],-1)=='/'){
// Page URL ends with "/": append the href directly.
$urlarray[$i]= $_GET['url'].$urlarray[$i];
}
else{
if(parse_url($_GET['url'], PHP_URL_PATH)=='/'){
// Path component is exactly "/" although the full URL does not end in "/".
// NOTE(review): basename("/") is presumably empty, which would make the
// str_ireplace below a no-op - confirm.
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i];
}
else if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-1)=='/'){
// Removing the last segment of the page URL leaves a string ending in "/".
if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-7)=='http://'){
// What remains is just "http://", i.e. the last segment directly follows
// the scheme: join the page URL and the href with "/".
// NOTE(review): the 7-character comparison cannot match a remainder of
// "https://", so https page URLs of this shape take the else branch.
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i];
}
else{
// Replace the last path segment of the page URL with the href.
// NOTE(review): str_ireplace removes every case-insensitive occurrence of
// $trim in the page URL, not only the trailing one.
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i];
}
}
else{
// Fallback: strip the path's last segment from the page URL and join the
// href with "/".
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i];
}
}
}
else {
// The probe returned headers: the href is treated as a host name.
$urlarray[$i]= 'http://'.$urlarray[$i];
}
// End of critical section.
}
else{
// href starts with "www.": only the scheme is missing.
$urlarray[$i]='http://'.$urlarray[$i];
}
}
}
// Probe every normalised link, print a tick/cross report line for it (unless
// $silentcrawl is set) and record each link that did not answer 404 in the
// `links` table:
//   - unknown URL  -> new row with the crawled page as referer and the anchor
//                     text as anchor_pool;
//   - known URL    -> the crawled page is appended to `referer` and the anchor
//                     text merged into `anchor_pool`, unless the page is
//                     already listed or the link points at the page itself.
// $con is expected to be an open mysqli connection; it is not created in this
// file - presumably connection.php provides it.
// All values reach SQL through bound parameters because hrefs, anchor texts
// and the page URL all originate from untrusted input.
$referringPage = is_scalar($_GET['url']) ? (string) $_GET['url'] : '';
$linkTotal = count($urlarray);
for ($i = 0; $i < $linkTotal; $i++) {
    $file = $urlarray[$i];
    $file_headers = @get_headers($file);

    // get_headers() returns false when the request fails; only read the
    // status line when there is one. The HTTP version is not fixed to 1.1 so
    // that "HTTP/1.0 404 ..." and similar answers are recognised as well.
    // NOTE(review): as before, a link whose probe fails outright is still
    // treated as live - confirm whether that is intended.
    $statusLine = (is_array($file_headers) && isset($file_headers[0])) ? $file_headers[0] : '';
    $isNotFound = preg_match('#^HTTP/\d+(?:\.\d+)?\s+404\b#i', $statusLine) === 1;

    // The URL comes from a remote page, so it is escaped before being echoed.
    $shownUrl = htmlspecialchars($file, ENT_QUOTES, 'UTF-8');

    if ($isNotFound) {
        if (!isset($silentcrawl)) {
            echo '<img style="width:20px;height:20px;float:left;" src="cross.png" > '.$shownUrl.'<br><Br>';
        }
        continue;
    }

    if (!isset($silentcrawl)) {
        echo '<img style="width:20px;height:20px;float:left;" src="tick.png" > '.$shownUrl.'<br><br>';
    }

    $anchorText = isset($urlanchor[$i]) ? (string) $urlanchor[$i] : '';

    // Fetch the stored rows for this URL up front so that the updates below
    // never run while a result set is still being read.
    $lookup = mysqli_prepare($con, "SELECT referer, anchor_pool FROM links WHERE url = ?");
    if ($lookup === false) {
        continue;
    }
    mysqli_stmt_bind_param($lookup, 's', $file);
    mysqli_stmt_execute($lookup);
    mysqli_stmt_store_result($lookup);
    mysqli_stmt_bind_result($lookup, $storedReferer, $storedAnchors);
    $existingRows = array();
    while (mysqli_stmt_fetch($lookup)) {
        // Cast to copy the values; the bound variables are overwritten on
        // every fetch.
        $existingRows[] = array(
            'referer'     => (string) $storedReferer,
            'anchor_pool' => (string) $storedAnchors,
        );
    }
    mysqli_stmt_close($lookup);

    // Insert Active Links into the database
    if (count($existingRows) === 0) {
        $insert = mysqli_prepare($con, "INSERT INTO links (url,referer,anchor_pool) VALUES (?,?,?)");
        if ($insert !== false) {
            mysqli_stmt_bind_param($insert, 'sss', $file, $referringPage, $anchorText);
            mysqli_stmt_execute($insert);
            mysqli_stmt_close($insert);
        }
        continue;
    }

    foreach ($existingRows as $row) {
        // Referers are stored as one space-separated string.
        $referers = explode(" ", $row['referer']);
        $alreadyCredited = in_array($referringPage, $referers, true) || $referringPage === $file;
        if ($alreadyCredited) {
            continue;
        }

        $newreferer = $row['referer']." ".$referringPage;

        // Merge the new anchor text word by word, dropping duplicates.
        $anchors = array_values(array_unique(explode(" ", $row['anchor_pool'].' '.$anchorText)));
        $final_anchor = implode(' ', $anchors);

        $update = mysqli_prepare($con, "UPDATE links SET referer = ?, anchor_pool = ? WHERE url = ?");
        if ($update !== false) {
            mysqli_stmt_bind_param($update, 'sss', $newreferer, $final_anchor, $file);
            mysqli_stmt_execute($update);
            mysqli_stmt_close($update);
        }
    }
}
// Warn when the crawl produced no non-empty link at all.
$usableLinks = array_filter($urlarray);
if (empty($usableLinks)) {
    echo "Either the URL is down or page contains no Links !, Try entering URL along with protocol used.";
}
// Make sure the crawled page itself has a row in `links`:
//   - no row yet  -> insert it with empty referer/anchor_pool, 0 backlinks
//                    and status 1;
//   - row exists  -> set status to 1 when the page currently answers 200.
// $con is expected to be an open mysqli connection - presumably provided by
// connection.php. The URL comes from the request, so it is only ever passed
// to SQL as a bound parameter.
$prime = $_GET['url'];
$primeKey = is_scalar($prime) ? (string) $prime : '';

$primeLookup = mysqli_prepare($con, "SELECT url FROM links WHERE url = ?");
if ($primeLookup !== false) {
    mysqli_stmt_bind_param($primeLookup, 's', $primeKey);
    mysqli_stmt_execute($primeLookup);
    mysqli_stmt_store_result($primeLookup);
    $count = mysqli_stmt_num_rows($primeLookup);
    mysqli_stmt_close($primeLookup);

    if ($count == 0) {
        $primeInsert = mysqli_prepare(
            $con,
            "INSERT INTO links (url,referer,anchor_pool,backlinks,status) VALUES (?,'','','0','1')"
        );
        if ($primeInsert !== false) {
            mysqli_stmt_bind_param($primeInsert, 's', $primeKey);
            mysqli_stmt_execute($primeInsert);
            mysqli_stmt_close($primeInsert);
        }
    } else {
        // get_headers() returns false on failure, so the status line is only
        // read when one exists. Any HTTP version with status 200 counts, not
        // just the literal "HTTP/1.1 200 OK".
        $file_headers = ($primeKey !== '') ? @get_headers($primeKey) : false;
        $statusLine = (is_array($file_headers) && isset($file_headers[0])) ? $file_headers[0] : '';
        if (preg_match('#^HTTP/\d+(?:\.\d+)?\s+200\b#i', $statusLine) === 1) {
            $primeUpdate = mysqli_prepare($con, "UPDATE links SET status='1' WHERE url = ?");
            if ($primeUpdate !== false) {
                mysqli_stmt_bind_param($primeUpdate, 's', $primeKey);
                mysqli_stmt_execute($primeUpdate);
                mysqli_stmt_close($primeUpdate);
            }
        }
    }
}
// Recompute `backlinks` for every row of `links` as the number of distinct
// host names found in its space-separated `referer` column.
// $con is expected to be an open mysqli connection - presumably provided by
// connection.php.
$res = mysqli_query($con, "SELECT url, referer FROM links");
$backlinkUpdate = mysqli_prepare($con, "UPDATE links SET backlinks = ? WHERE url = ?");
if ($res !== false && $backlinkUpdate !== false) {
    // Bound by reference once; the variables are refreshed on each pass.
    mysqli_stmt_bind_param($backlinkUpdate, 'ss', $bk, $linkUrl);

    while ($row = mysqli_fetch_array($res)) {
        // Hosts are collected as array keys, which de-duplicates them. Every
        // referer is visited directly: the earlier index loop ran over an
        // array_filter() result whose keys are not contiguous, so entries
        // were skipped and left as full URLs, inflating the count.
        $referringHosts = array();
        foreach (explode(" ", (string) $row['referer']) as $refererUrl) {
            if ($refererUrl === '') {
                continue;
            }
            $refererHost = parse_url($refererUrl, PHP_URL_HOST);
            if (is_string($refererHost) && $refererHost !== '') {
                $referringHosts[$refererHost] = true;
            }
        }

        // The stored URL originated from crawled pages, so it is bound rather
        // than interpolated into the statement.
        $bk = (string) count($referringHosts);
        $linkUrl = $row['url'];
        mysqli_stmt_execute($backlinkUpdate);
    }
    mysqli_stmt_close($backlinkUpdate);
}
?>
可運行的代碼?你連自己無法運行的代碼都還沒貼出來! –
DOMDocument 只加載頁面的標記。如果頁面加載速度慢,可能有幾個原因:網絡連接差、服務器速度慢等。在我看來,DOMDocument 運行速度非常快。 – Marcel
我已添加代碼 – user3475546