0
我是編程的新手,所以這是我的問題。我正在嘗試構建一個遞歸的php spider usind簡單的HTML DOM解析器,爬入某個網站並返回包含2xx,3xx,4xx和5xx的頁面列表。我一直在尋找解決方案的幾天,但(可能是由於我的經驗不足),我還沒有找到任何工作。我的實際代碼找到根/索引頁面上的所有鏈接,但是我希望能夠找到那些先前找到的鏈接內的遞歸等鏈接,例如到第5級。假設根頁面是第0級,遞歸函數我只寫了1級鏈接,重複5次。任何幫助讚賞。謝謝。使用簡單的HTML查找嵌套鏈接DOM(遞歸)
<?php
echo "<strong><h1>Sitemap</h1></strong><br>";
include_once('simple_html_dom.php');
$url = "http://www.gnet.it/";
$html = new simple_html_dom();
$html->load_file($url);
echo "<strong><h2>Int Links</h2></strong><br>";
foreach($html->find("a") as $a)
{
if((!(preg_match('#^(?:https?|ftp)://.+$#', $a->href)))&&($a->href != null)&&($a->href != "javascript:;")&&($a->href != "#"))
{
echo "<strong>" . $a->href . "</strong><br>";
}
}
echo "<strong><h2>Ext Links</h2></strong><br>";
foreach($html->find("a") as $a)
{
if(((preg_match('#^(?:https?|ftp)://.+$#', $a->href)))&&($a->href != null)&&($a->href != "javascript:;")&&($a->href != "#"))
{
echo "<strong>" . $a->href . "</strong><br>";
}
}
//recursion
$depth = 1;
$maxDepth = 5;
$recurl = "$a->href";
$rechtml = new simple_html_dom();
$rechtml->load_file($recurl);
while($depth <= $maxDepth){
echo "<strong><h2>Link annidati livello $depth</h2></strong><br>";
foreach($rechtml->find("a") as $a)
{
if(($a->href != null))
{
echo "<strong>" . $a->href . "</strong><br>";
}
}
$depth++;
}
//csv
echo "<strong><h1>Google Crawl Errors from CSV</h1></strong><br>";
echo "<table>\n\n";
$f = fopen("CrawlErrors.csv", "r");
while (($line = fgetcsv($f)) !== false) {
echo "<tr>";
foreach ($line as $cell) {
echo "<td>" . htmlspecialchars($cell) . "</td>";
}
echo "</tr>\n";
}
fclose($f);
echo "\n</table>";
?>