-2
我一直在使用以下腳本為我客戶的網站建立站點地圖。問題是它並非對每個網站都有效。我發現,託管在 GoDaddy 上的網站大多數(甚至可能全部)都無法被爬取。如果有人能看出我腳本中的錯誤,或知道是什麼導致了這個問題,我將非常感謝您的幫助。(標題:PHP 爬蟲腳本無法運作)
在此先感謝
// Remove PHP's execution time limit (0 = unlimited) so a long-running crawl
// is not aborted part-way through.
set_time_limit(0);
/**
 * Recursive link crawler that collects the URLs of one site (e.g. for a sitemap).
 *
 * Starting from "http://<domain>", each fetched page is scanned for absolute,
 * double-quoted href="http(s)://..." links. Links whose URL contains the
 * configured domain string and whose extension is not banned are fetched in
 * turn. Every URL visited is appended to $crawled.
 *
 * Known limitations (unchanged from the original behaviour):
 *  - relative links, single-quoted links and URLs containing characters outside
 *    the pattern's character class (for example "?" or "%") are not followed;
 *  - the domain test is a plain substring match on the whole URL;
 *  - pages are fetched with file_get_contents(), so HTTP URL wrappers must be
 *    enabled (allow_url_fopen) on the host running the script.
 */
class spider_man
{
    /** @var string Start URL, built as 'http://' . $domain. */
    public $url;

    /** @var int Remaining budget; decremented once per on-domain link examined. */
    public $limit;

    /** @var string|false|null Body of the page currently being scanned. */
    public $cache;

    /** @var array URLs recorded so far, in discovery order, without duplicates. */
    public $crawled = array();

    /** @var array Extensions (with leading dot) that must never be crawled. */
    public $banned_ext;

    /** @var string Domain string that a link must contain to be followed. */
    public $domain;

    /**
     * Configure the crawler and immediately crawl the site.
     *
     * @param string $url        Domain without scheme, e.g. 'domain.com'.
     * @param array  $banned_ext Extensions to skip, e.g. array('.css', '.jpg').
     * @param int    $limit      Maximum number of on-domain links to examine.
     */
    public function __construct($url, $banned_ext, $limit)
    {
        $this->domain = $url;
        $this->url = 'http://' . $url;
        $this->banned_ext = $banned_ext;
        $this->limit = $limit;

        // Probe the start URL first; unlike the suppressed fetch in _spider(),
        // a failure here emits a visible warning. Close the handle afterwards
        // so the stream is not leaked.
        $probe = fopen($this->url, 'r');
        if ($probe === false) {
            return;
        }
        fclose($probe);

        $this->_spider($this->url);
    }

    /**
     * Legacy PHP 4-style entry point, kept so code that calls it explicitly
     * keeps working. It simply delegates to __construct().
     *
     * @param string $url
     * @param array  $banned_ext
     * @param int    $limit
     */
    public function spider_man($url, $banned_ext, $limit)
    {
        $this->__construct($url, $banned_ext, $limit);
    }

    /**
     * Fetch one page, record it, and recurse into its eligible links.
     *
     * @param string $url URL (or path) to fetch; it is URL-decoded before use.
     *
     * @return false|null False when the page could not be fetched or was empty,
     *                    null otherwise.
     */
    public function _spider($url)
    {
        $decoded = urldecode($url);

        // Best-effort fetch: unreachable pages are skipped silently.
        $this->cache = @file_get_contents($decoded);
        if (!$this->cache) {
            return false;
        }

        // The caller may already have recorded this URL before recursing;
        // avoid listing it twice.
        if (!$this->is_crawled($decoded)) {
            $this->crawled[] = $decoded;
        }

        preg_match_all("#href=\"(https?://[&=a-zA-Z0-9-_./]+)\"#si", $this->cache, $links);

        foreach ($links[1] as $hyperlink) {
            // Skip off-domain links but keep scanning the rest of the page
            // (a `break` here would drop every link after the first external one).
            if (strpos($hyperlink, $this->domain) === false) {
                continue;
            }

            // Stop at or below zero: after a nested call exhausts the budget the
            // counter keeps decreasing here, and it must not become "truthy" again.
            $this->limit--;
            if ($this->limit <= 0) {
                return;
            }

            if ($this->is_valid_ext(trim($hyperlink)) && !$this->is_crawled($hyperlink)) {
                // Record before fetching so the same link is never attempted twice.
                $this->crawled[] = $hyperlink;
                echo "Crawling $hyperlink<br />\n";
                // Release the current page body before going deeper.
                $this->cache = null;
                $this->_spider($hyperlink);
            }
        }
    }

    /**
     * Tell whether a URL may be crawled based on its trailing extension.
     *
     * The comparison is case-insensitive; empty entries in the banned list
     * are ignored.
     *
     * @param string $url
     *
     * @return bool False when the URL ends with a banned extension.
     */
    public function is_valid_ext($url)
    {
        $lowered = strtolower($url);
        foreach ($this->banned_ext as $ext) {
            $ext = strtolower($ext);
            if ($ext !== '' && substr($lowered, -strlen($ext)) === $ext) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tell whether a URL has already been recorded.
     *
     * @param string $url
     *
     * @return bool
     */
    public function is_crawled($url)
    {
        return in_array($url, $this->crawled, true);
    }
}
// Extensions the crawler must not follow (assets, feeds, binaries, non-PHP pages).
$banned_ext = array(
    ".dtd", ".css", ".xml", ".js",
    ".gif", ".jpg", ".jpeg", ".bmp", ".ico",
    ".rss", ".pdf", ".png", ".psd",
    ".aspx", ".jsp", ".srf", ".cgi", ".exe", ".cfm"
);

// Crawl the site with a budget of 100 links.
$spider = new spider_man('domain.com', $banned_ext, 100);

// Dump every URL the crawler recorded.
print_r($spider->crawled);
具體是什麼問題? –
當您嘗試在GoDaddy網站上運行它時會發生什麼? – andrewsi
這些網站不返回任何網址。它只是返回一個空數組 – James