2016-06-25 70 views
0

我做這個爬蟲已經一個月了，正在嘗試抓取從 MySQL 取得的鏈接（href）。如何讓 PHP 遞歸爬蟲抓取得更快？

我已經施加在其上的許多技術儘可能 我試圖與

  1. 使用 curl 調用，並通過 exec 的 shell 調用實現並行處理。
  2. 通過 exec shell 調用 PHP 腳本來實現並行處理。
  3. 嘗試了 pthreads 並行線程，但它無法正常工作（不知道爲什麼）。

我遞歸調用函數：先從網站抓取鏈接，再進一步抓取這些鏈接指向的頁面（過濾掉無效鏈接，如 #、javascript:void 等）。大約 30 分鐘內能得到 5 萬到 6 萬條記錄，但其中很可能有大量重複——如果對這些記錄做去重（distinct）查詢，我只得到約 5 萬條記錄。

這裏是我的代碼

/**
 * Fetch a batch of URLs in parallel with curl_multi, extract their anchor
 * links, mark each scraped domain in the `domains` table, then recurse into
 * the internal links that extrnl_intrnl_filter() returns.
 *
 * @param array $urls List of absolute URLs to scrape in this batch.
 * @return void
 */
function multiRequest($urls) {

    global $link; // mysqli connection opened elsewhere in this file

    // Internal links discovered across the WHOLE batch; these seed the next
    // recursion level.  BUG FIX: the original left this variable undefined
    // when no request completed (count() on null), and overwrote it on every
    // finished handle so only the last page's links were ever recursed into.
    $next_batch = array();

    $rolling_window = sizeof($urls);

    $master = curl_multi_init();

    // Shared cURL options for every handle in the batch.
    // NOTE(review): CONNECTTIMEOUT (35s) exceeds TIMEOUT (30s); the total
    // timeout caps the connect phase anyway, but the values look swapped.
    $std_options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 35,
        CURLOPT_HEADER         => false,
        CURLOPT_TIMEOUT        => 30,
    );
    $options = $std_options;

    // Queue every URL on the multi handle up front.
    for ($i = 0; $i < $rolling_window; $i++) {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        // Stash the URL on the handle so it can be recovered on completion.
        $options[CURLOPT_PRIVATE] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
    }

    do {
        while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
        if ($execrun != CURLM_OK) {
            break;
        }

        // A request just completed -- find out which one and process it.
        while ($done = curl_multi_info_read($master)) {

            $scraped_url = curl_getinfo($done['handle'], CURLINFO_PRIVATE);
            $html        = curl_multi_getcontent($done['handle']);

            // Links found on THIS page only.  BUG FIX: the original
            // accumulated links across every handle in the batch, so each
            // later domain's total_scraped_links count included all earlier
            // domains' links.
            $page_links = array();

            $domDoc = new DOMDocument('1.0');
            @$domDoc->loadHTML($html); // suppress warnings from malformed real-world HTML

            foreach ($domDoc->getElementsByTagName('a') as $element) {
                $href = trim(rtrim($element->getAttribute('href'), "/"));

                // Skip fragments, empty/self links, javascript: pseudo-links,
                // index pages and mailto: links.
                if ($href == '' || $href == $scraped_url
                        || strpos($href, '#') !== false
                        || strpos($href, 'javascript:') !== false
                        || strpos($href, 'index.php') !== false
                        || strpos($href, 'mailto:') !== false) {
                    continue;
                }

                // Skip links to image/binary assets.
                $is_asset = false;
                foreach (array('.jpg', '.jpeg', '.png', '.gif', '.tiff', '.tif', '.pdf') as $ext) {
                    if (strpos($href, $ext) !== false) {
                        $is_asset = true;
                        break;
                    }
                }
                if ($is_asset) {
                    continue;
                }

                // Resolve relative links against the scraped page's origin.
                if (0 !== strpos($href, 'http')) {
                    $parts = parse_url($scraped_url);
                    // BUG FIX: guard against an unparseable base URL; the
                    // original dereferenced scheme/host without checking.
                    if (!isset($parts['scheme'], $parts['host'])) {
                        continue;
                    }
                    $href = $parts['scheme'] . '://' . $parts['host']
                          . (isset($parts['port']) ? ':' . $parts['port'] : '')
                          . '/' . ltrim($href, '/');
                }

                $page_links[] = rtrim($href, "/");
            }

            $page_links = array_unique($page_links);

            // Mark this domain as scraped and record how many links it had.
            // BUG FIX: the key is now escaped instead of interpolated raw.
            $scraped_domain_key = key_domain_generator(remove_http($scraped_url));
            mysqli_query(
                $link,
                "UPDATE domains SET is_scraped=1, total_scraped_links = '" . count($page_links)
                . "' WHERE domain_u_key = '" . mysqli_real_escape_string($link, $scraped_domain_key) . "'"
            ) or die(mysqli_error($link));

            // Classify links; internal ones feed the next recursion level.
            $internal = extrnl_intrnl_filter($page_links, $scraped_url);
            if (is_array($internal)) {
                $next_batch = array_merge($next_batch, $internal);
            }

            curl_multi_remove_handle($master, $done['handle']);
        }
    } while ($running);

    curl_multi_close($master);

    // Recurse into newly discovered internal links.
    // NOTE(review): recursion depth is unbounded -- consider a depth limit
    // and politeness throttling (robots.txt, crawl delay).
    $next_batch = array_unique($next_batch);
    if (count($next_batch) > 0) {
        multiRequest($next_batch);
    }
}

/**
 * Split scraped links into external and internal relative to $domain_link.
 *
 * New external domains (not yet in `domains`, matching a bare registrable-
 * domain pattern) and all internal links are inserted via
 * domain_insert_check() and handed to a WorkerThreads worker each.
 *
 * @param array  $href_array  Absolute URLs extracted from one page.
 * @param string $domain_link URL of the page the links came from.
 * @return array Unique internal links, for the caller to recurse into.
 */
function extrnl_intrnl_filter($href_array, $domain_link) {

    global $link; // mysqli connection

    $workers = array();
    $worker_count = 0;

    // BUG FIX: this was never initialised in the original, so when no
    // internal link was found, array_unique(null) at the bottom raised a
    // warning and the function returned null instead of an array.
    $internal_links = array();

    foreach ($href_array as $href) {
        $href_url = parse_url($href);
        // BUG FIX: skip links without a host part instead of emitting an
        // undefined-index notice and working on an empty host.
        if (!isset($href_url['host'])) {
            continue;
        }
        $key_href = giveHost($href_url['host']);

        if (isexternal($href_url['host'], $domain_link) == 'External') {
            // Only consider external domains we have not stored yet.
            // BUG FIX: the key is escaped instead of interpolated raw (SQLi).
            $safe_key = mysqli_real_escape_string($link, $key_href);
            $result = mysqli_query(
                $link,
                "select count(*) as domain_found from domains where base_url='$safe_key'"
            ) or die(mysqli_error($link));
            $domaininfo = mysqli_fetch_assoc($result);

            // Unseen domain whose base looks like a bare registrable domain
            // (e.g. "example.com") -- insert it and spawn a worker.
            if ($domaininfo['domain_found'] == 0
                    && preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) {
                if (domain_insert_check($href, 1)) {
                    echo 'prgress';
                    $workers[$worker_count] = new WorkerThreads($href);
                    $workers[$worker_count]->start();
                    $worker_count++;
                }
            }
        } else {
            // Internal link: insert it and queue it for the next crawl level.
            if (domain_insert_check($href, 0)) {
                $workers[$worker_count] = new WorkerThreads($href);
                $workers[$worker_count]->start();
                $worker_count++;
                $internal_links[] = $href;
            }
        }
    }

    // Wait for every spawned worker thread before returning.
    for ($i = 0; $i < $worker_count; $i++) {
        $workers[$i]->join();
    }

    return array_unique($internal_links);
}

/**
 * Insert a newly discovered URL into the `domains` table with default
 * metric columns; on failure, log the attempt to `domainerror`.
 *
 * @param string $href        Absolute URL to record.
 * @param int    $is_external 1 if the link points off-site, 0 if internal.
 * @return bool  true on successful insert, false otherwise.
 */
function domain_insert_check($href, $is_external) {

    global $link; // mysqli connection

    $href_url = parse_url($href);
    $href_ex_https = remove_http($href);
    // BUG FIX: guard missing URL components; the original emitted
    // undefined-index notices for malformed URLs.
    $href_domain = isset($href_url['host']) ? $href_url['host'] : '';
    $href_scheme = isset($href_url['scheme']) ? $href_url['scheme'] : '';
    $key_href_i = key_domain_generator($href_ex_https);

    // SECURITY FIX: addslashes() is not a safe SQL escape (charset-dependent
    // bypasses) and several values were interpolated completely unescaped.
    // Use the connection-aware escaper for every string value.  (Ideally,
    // switch to prepared statements.)
    // NOTE: "doamin_schema" is the actual (misspelt) column name -- kept.
    $query = "insert into domains set domain_name = '" . mysqli_real_escape_string($link, $href_ex_https) . "',"
         . "doamin_schema = '" . mysqli_real_escape_string($link, $href_scheme) . "',"
         . "base_url = '" . mysqli_real_escape_string($link, strtolower(giveHost($href_domain))) . "',"
         . "domain_u_key = '" . mysqli_real_escape_string($link, $key_href_i) . "',"
         . "is_expired = '0',"
         . "is_scraped = '0',"
         . "is_external = '" . (int) $is_external . "',"
         . "ExtBackLinks = '0',"
         . "RefDomains='0',"
         . "ACRank = '0',"
         . "RefIPs = '0',"
         . "RefSubNets = '0',"
         . "RefDomainsEDU = '0',"
         . "RefDomainsGOV = '0',"
         . "Title = 'title',"
         . "total_scraped_links = '0',"
         . "CitationFlow = '0',"
         . "TrustFlow = '0',"
         . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0',"
         . "TopicalTrustFlow_Value_0 = '0',"
         . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1',"
         . "TopicalTrustFlow_Value_1 = '0',"
         . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2',"
         . "TopicalTrustFlow_Value_2 = '0',"
         . "date_created = '" . date('Y-m-d H:i:s') . "',"
         . "user_id = 1";

    if (mysqli_query($link, $query)) {
        return true;
    }

    // Record the failed insert (duplicate key, etc.) for later inspection.
    mysqli_query(
        $link,
        "insert into domainerror SET error = '" . mysqli_real_escape_string($link, $key_href_i)
        . "' , domains= '" . mysqli_real_escape_string($link, $href_ex_https) . "', type='fail'"
    );
    return false;
}

我真的不知道還能怎樣優化它，讓它抓取更多記錄——我已經盡我所能優化了。如果我直接用 PHP 調用而不是 curl，它會耗盡 MySQL 的最大連接數；如果我用 pthreads，它運行一次之後就停止了。

+0

一個友好的機器人不會試圖以最快的速度刮地盤。嘗試將其限制爲每分鐘幾頁。最後你會得到你需要的數據。考慮到robots.txt,並在瀏覽器中顯示你是誰,你是什麼。不要試圖成爲一個糟糕的機器人。 –

回答

0

我的第一個建議是去掉 DOMDocument，改用正則表達式來解析：這樣更快、內存佔用更少、耗時更短。

另一個較小的改進是把數組搜索替換爲 O(1) 的查找，如果可能的話使用哈希表（hash map）。

$filter_links = array_unique($filter_links); 

因此,你應該有一個$ urlMap [$ urlKey] = $ url; 如果您沒有找到它,請繼續插入它。計算密鑰的快速方法可能是使用md5,但有更快的方法。

從我看到的另一個大I/O問題是,您爲每個抓取的網站插入數據庫。而不是這樣做,你可以用數據分成另一個數組,最後將所有的網站數據插入你的sql服務器。

不過,您將獲得一些加速,但爲了擴展,您必須考慮將流程拆分爲多個服務器的方法。爲此,您需要一個隊列系統,您可以使用RabbitMq https://www.rabbitmq.com/