因爲抓取網頁可能會花費很多時間,所以我想用 pcntl_fork() 函數創建多個子進程來分擔我的代碼的工作。
- 主進程 - 爬取域名
- 子進程 - 收到一個鏈接時,子進程必須爬取在該域中找到的這個鏈接
- 子進程的鏈接 - 收到新的鏈接時,必須執行與第 2 點相同的操作。
子進程的數量可以想開多少就開多少,還是我必須設置子進程數量的上限?
這裏是我的代碼:
/**
 * Crawler that reports pages containing the posted keyword and records the
 * matching text nodes in the `crawler` table.
 *
 * Uses PDO (MySQL driver) and ENT_SUBSTITUTE, so PHP 5.4 or newer is needed.
 */
class MyCrawler extends PHPCrawler
{
    /** @var PDO|null Connection opened on first use and reused for later pages. */
    private $matchStore = null;

    /**
     * Reports one crawled page and stores every text node that contains the keyword.
     *
     * The keyword is read from $_POST['keywords']. Pages that were not received,
     * cannot be downloaded, or do not contain the keyword produce no output.
     *
     * @param array $page_data Page information supplied by the crawler. This method reads
     *                         the "url", "received", "header", "referer_url" and
     *                         "bytes_received" keys and does not modify the array.
     */
    public function handlePageData(&$page_data)
    {
        $keywords = (isset($_POST['keywords']) && is_string($_POST['keywords']))
            ? $_POST['keywords']
            : '';

        // Cheap checks first: without a keyword or a received page there is nothing to do.
        if ($keywords === '' || empty($page_data['received'])) {
            return;
        }

        // NOTE(review): the page is downloaded a second time here. PHPCrawl appears to
        // expose the fetched body in $page_data — confirm and reuse it if so.
        $html = file_get_contents($page_data['url']);

        // strpos() returns 0 for a match at the very start, so compare against false.
        // NOTE(review): this pre-filter is case-sensitive while the XPath search below
        // is not — confirm which behaviour is intended.
        if ($html === false || strpos($html, $keywords) === false) {
            return;
        }

        $this->printPageSummary($page_data);

        // Fall back to the page's own URL when there is no referer, using a local
        // copy so the caller's array is left untouched.
        $referer = empty($page_data['referer_url'])
            ? $page_data['url']
            : $page_data['referer_url'];

        $this->recordMatches(
            $this->findKeywordNodes($html, $keywords),
            $referer,
            $page_data['url'],
            $keywords
        );

        echo '<br><br>';
        echo str_pad(" ", 5000); // Padding so that flush() actually pushes output to the browser.
        flush();
    }

    /**
     * Prints the HTML summary table for a page.
     *
     * @param array $page Page information as passed to handlePageData().
     */
    private function printPageSummary($page)
    {
        // The table is always opened, so the closing tag below is never orphaned.
        echo "<table border='1' >";

        if (!empty($page['header'])) {
            // Only the first header line (the status line) is shown.
            $statusLine = (string) strtok($page['header'], "\n");
            echo "<tr><td width='300'>Status:</td><td width='500'> "
                . $this->escapeHtml($statusLine) . '</td></tr>';
        }

        $referer = isset($page['referer_url']) ? $page['referer_url'] : '';
        $bytes = isset($page['bytes_received']) ? $page['bytes_received'] : 0;

        echo '<tr><td>Page requested:</td><td> ' . $this->escapeHtml($page['url']) . '</td></tr>';
        echo '<tr><td>Referer-page:</td><td> ' . $this->escapeHtml($referer) . '</td></tr>';
        echo '<tr><td>Content received: </td><td>' . round($bytes / 1024, 2)
            . ' Kbytes</td></tr></table>';
    }

    /**
     * Returns the text nodes of an HTML document that contain the keyword,
     * compared case-insensitively for ASCII letters.
     *
     * @param string $html    Raw HTML source.
     * @param string $keyword Non-empty search term.
     *
     * @return DOMNodeList|array Matching nodes; an empty array when parsing or the query fails.
     */
    private function findKeywordNodes($html, $keyword)
    {
        // Real-world markup is rarely valid; collect parser errors instead of emitting warnings.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            return array();
        }

        $xpath = new DOMXPath($doc);
        $query = "//text()[contains(translate(.,'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), "
            . $this->toXpathLiteral(strtoupper($keyword)) . ')]';
        $nodes = $xpath->query($query);

        return $nodes === false ? array() : $nodes;
    }

    /**
     * Prints each match and inserts it into the `crawler` table unless a row
     * with the same text already exists.
     *
     * @param DOMNodeList|array $nodes    Matching text nodes.
     * @param string            $referer  Value stored in the `domain` column.
     * @param string            $url      URL of the page the matches came from.
     * @param string            $keywords Keyword that was searched for.
     */
    private function recordMatches($nodes, $referer, $url, $keywords)
    {
        $db = $this->openMatchStore();

        try {
            $exists = $db->prepare('SELECT 1 FROM crawler WHERE data = ? LIMIT 1');
            // NOTE(review): '' for id is kept from the original statement and presumably
            // relies on an auto-increment column in non-strict SQL mode — confirm the schema.
            $insert = $db->prepare(
                "INSERT INTO crawler (id, domain, url, keywords, data) VALUES ('', ?, ?, ?, ?)"
            );

            foreach ($nodes as $node) {
                $text = $node->nodeValue;
                print "Gevonden: " . $this->escapeHtml($text) . "<br />";

                $exists->execute(array($text));
                $known = $exists->fetchColumn() !== false;
                $exists->closeCursor();

                if ($known) {
                    echo 'Column already exist';
                } else {
                    $insert->execute(array($referer, $url, $keywords, $text));
                    echo 'added';
                }
                echo '<br>';
            }
        } catch (PDOException $e) {
            // Report the failure but let the crawl continue with the next page.
            echo 'Database error: ' . $this->escapeHtml($e->getMessage()) . '<br>';
        }
    }

    /**
     * Returns the database connection, opening it on first use.
     * Terminates the script when the connection cannot be established.
     *
     * @return PDO
     */
    private function openMatchStore()
    {
        if ($this->matchStore === null) {
            try {
                // NOTE(review): credentials are hard-coded; move them to configuration.
                $this->matchStore = new PDO(
                    'mysql:host=localhost;dbname=crawler',
                    'crawler',
                    'DRZOIDBERGGG',
                    array(
                        PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                        PDO::ATTR_EMULATE_PREPARES => false,
                    )
                );
            } catch (PDOException $e) {
                die('Could not connect: ' . $e->getMessage());
            }
        }

        return $this->matchStore;
    }

    /**
     * Quotes a string as an XPath 1.0 literal.
     *
     * XPath 1.0 has no escape character, so a value containing both quote
     * styles is assembled with concat().
     *
     * @param string $value Arbitrary text.
     *
     * @return string XPath expression that evaluates to $value.
     */
    private function toXpathLiteral($value)
    {
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        return "concat('" . implode("', \"'\", '", explode("'", $value)) . "')";
    }

    /**
     * Escapes text taken from crawled pages for safe inclusion in HTML output.
     *
     * @param mixed $value Value to print.
     *
     * @return string
     */
    private function escapeHtml($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
忘了說:我需要一個適用於 Windows(x86)32 位的解決方案!
因爲我的客戶端不支持它。
評論2全部:在代碼中尋找有趣的單詞並贏得一些大獎! – Jordy 2010-09-14 07:43:28
'DRZOIDBERGGG'? – 2010-09-21 20:25:59