I have been working on a scraper for a month, trying to scrape the links (hrefs) fetched from MySQL and build a faster recursive PHP scraper. I have applied as many techniques as I could:
- curl_multi for parallel processing;
- exec() shell calls running PHP scripts in parallel;
- pthreads, which did not work correctly (not sure why).
I call a function recursively to grab links from a site and then crawl those links further, filtering out invalid ones (#, javascript:void, etc.). In about 30 minutes I get 500,000 to 600,000 records, but most of them are duplicates — if I query the distinct values from these records, I only get about 50,000 rows.
Here is my code:
/**
 * Fetch a batch of URLs in parallel with curl_multi, extract candidate
 * links from each response, record the per-domain scraped-link count in
 * MySQL, and recurse on the internal links discovered in this batch.
 *
 * @param string[] $urls Absolute URLs to fetch in this batch.
 * @return void
 */
function multiRequest($urls) {
    global $link;
    $rolling_window = sizeof($urls);
    $master = curl_multi_init();
    // Shared curl options applied to every handle in the batch.
    $std_options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 35,
        CURLOPT_HEADER         => false,
        CURLOPT_TIMEOUT        => 30,
    );
    $options = $std_options;
    // Queue every URL up front; CURLOPT_PRIVATE carries the source URL so
    // the handle can be identified when it completes.
    for ($i = 0; $i < $rolling_window; $i++) {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        $options[CURLOPT_PRIVATE] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
    }
    // BUGFIX: accumulate internal links across ALL completed handles.
    // Previously this variable was only assigned inside the completion
    // loop, so it was overwritten per handle and was undefined at the
    // recursion point when no handle ever completed.
    $namecheap_filter_internal_array = array();
    do {
        while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
        if ($execrun != CURLM_OK) {
            break;
        }
        // A request was just completed -- find out which one.
        while ($done = curl_multi_info_read($master)) {
            $available_curl = curl_getinfo($done['handle'], CURLINFO_PRIVATE);
            $html = curl_multi_getcontent($done['handle']);
            // BUGFIX: reset per handle. The old code let this array grow
            // across every page in the batch, so total_scraped_links was a
            // running total rather than a per-domain count, and the same
            // links were re-filtered for every later page.
            $filter_links = array();
            $domDoc = new DOMDocument('1.0');
            @$domDoc->loadHTML($html);
            $anchors = $domDoc->getElementsByTagName('a');
            foreach ($anchors as $element) {
                $href = trim(rtrim($element->getAttribute('href'), "/"));
                // Skip fragments, empty/self links, javascript:, mailto:,
                // index pages, and image/PDF resources (the single anchored
                // regex also catches URLs like "x.jpg?v=2").
                if ($href == '' || $href == $available_curl
                        || strpos($href, '#') !== false
                        || strpos($href, 'javascript:') !== false
                        || strpos($href, 'index.php') !== false
                        || strpos($href, 'mailto:') !== false
                        || preg_match('/\.(jpe?g|png|gif|tiff?|pdf)(\?|$)/i', $href)) {
                    continue;
                }
                // Resolve relative links against the scraped page's origin.
                if (0 !== strpos($href, 'http')) {
                    $path = '/' . ltrim($href, '/');
                    $parts = parse_url($available_curl);
                    $href = $parts['scheme'] . '://' . $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
                $filter_links[] = rtrim($href, "/");
            }
            $filter_links = array_unique($filter_links);
            $scraped_domain = remove_http($available_curl);
            $scraped_domain_key = key_domain_generator($scraped_domain);
            // SECURITY: parameterised UPDATE instead of string-built SQL --
            // $scraped_domain_key derives from scraped, untrusted URLs.
            $stmt = mysqli_prepare(
                $link,
                "UPDATE domains SET is_scraped=1, total_scraped_links=? WHERE domain_u_key=?"
            ) or die(mysqli_error($link));
            $total_links = count($filter_links);
            mysqli_stmt_bind_param($stmt, 'is', $total_links, $scraped_domain_key);
            mysqli_stmt_execute($stmt) or die(mysqli_stmt_error($stmt));
            mysqli_stmt_close($stmt);
            $internal_links = extrnl_intrnl_filter($filter_links, $available_curl);
            $namecheap_filter_internal_array =
                array_merge($namecheap_filter_internal_array, $internal_links);
            curl_multi_remove_handle($master, $done['handle']);
            // BUGFIX: free the easy handle -- the old code leaked one curl
            // handle (and its buffered response body) per URL.
            curl_close($done['handle']);
        }
    } while ($running);
    curl_multi_close($master);
    // Recurse on the unique internal links discovered in this batch.
    $namecheap_filter_internal_array = array_unique($namecheap_filter_internal_array);
    if (count($namecheap_filter_internal_array) > 0) {
        multiRequest($namecheap_filter_internal_array);
    }
}
/**
 * Split scraped links into external and internal domains, insert
 * previously-unseen domains into MySQL via domain_insert_check(), and
 * spawn one WorkerThreads instance per accepted link.
 *
 * @param string[] $href_array  Absolute links scraped from one page.
 * @param string   $domain_link The page the links were scraped from.
 * @return string[] Unique internal links to feed into the next crawl pass.
 */
function extrnl_intrnl_filter($href_array, $domain_link) {
    global $link;
    $workers = array();
    $x_count = 0;
    // BUGFIX: initialise the result array -- when every link was external
    // the old code passed an undefined variable to array_unique().
    $namecheap_filter_internal_array = array();
    foreach ($href_array as $href) {
        $href_url = parse_url($href);
        // BUGFIX: relative or malformed links have no 'host' component;
        // skip them instead of raising an undefined-index notice.
        if (!isset($href_url['host'])) {
            continue;
        }
        $href_domain = $href_url['host'];
        $key_href = giveHost($href_domain);
        if (isexternal($href_domain, $domain_link) == 'External') {
            // SECURITY: parameterised lookup -- $key_href derives from
            // scraped, untrusted URLs.
            $stmt = mysqli_prepare(
                $link,
                "select count(*) as domain_found from domains where base_url=?"
            ) or die(mysqli_error($link));
            mysqli_stmt_bind_param($stmt, 's', $key_href);
            mysqli_stmt_execute($stmt) or die(mysqli_stmt_error($stmt));
            $domaininfo = mysqli_fetch_assoc(mysqli_stmt_get_result($stmt));
            mysqli_stmt_close($stmt);
            // Only queue external domains we have never seen before, and
            // only bare second-level domains (e.g. "example.com").
            if ($domaininfo['domain_found'] == 0
                    && preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) {
                if (domain_insert_check($href, 1)) {
                    echo 'prgress';
                    $workers[$x_count] = new WorkerThreads($href);
                    $workers[$x_count]->start();
                    $x_count++;
                }
            }
        } else {
            // Internal link: insert it and remember it for the next pass.
            if (domain_insert_check($href, 0)) {
                $workers[$x_count] = new WorkerThreads($href);
                $workers[$x_count]->start();
                $x_count++;
                $namecheap_filter_internal_array[] = $href;
            }
        }
    }
    // Wait for every spawned worker before returning.
    for ($forvar = 0; $forvar < $x_count; $forvar++) {
        $workers[$forvar]->join();
    }
    return array_unique($namecheap_filter_internal_array);
}
/**
 * Insert a single domain row into the `domains` table; on failure, log
 * the attempt to `domainerror` instead.
 *
 * @param string $href        Absolute URL of the domain to record.
 * @param int    $is_external 1 if the domain is external to the crawl seed.
 * @return bool True when the insert succeeded, false otherwise.
 */
function domain_insert_check($href, $is_external) {
    global $link;
    $href_url = parse_url($href);
    $href_ex_https = remove_http($href);
    // BUGFIX: guard against malformed URLs -- parse_url() may omit the
    // 'host'/'scheme' keys, which previously raised undefined-index notices.
    $href_domain = isset($href_url['host']) ? $href_url['host'] : '';
    $href_scheme = isset($href_url['scheme']) ? $href_url['scheme'] : 'http';
    $key_href_i = key_domain_generator($href_ex_https);
    // SECURITY: escape with the connection-aware mysqli_real_escape_string
    // rather than addslashes(), which is not safe for SQL escaping.
    $esc_domain = mysqli_real_escape_string($link, $href_ex_https);
    $esc_scheme = mysqli_real_escape_string($link, $href_scheme);
    $esc_base   = mysqli_real_escape_string($link, strtolower(giveHost($href_domain)));
    $esc_key    = mysqli_real_escape_string($link, $key_href_i);
    $query = "insert into domains set domain_name = '" . $esc_domain . "',"
        . "doamin_schema = '" . $esc_scheme . "',"
        . "base_url = '" . $esc_base . "',"
        . "domain_u_key = '" . $esc_key . "',"
        . "is_expired = '0',"
        . "is_scraped = '0',"
        . "is_external = '" . (int) $is_external . "',"
        . "ExtBackLinks = '0',"
        . "RefDomains='0',"
        . "ACRank = '0',"
        . "RefIPs = '0',"
        . "RefSubNets = '0',"
        . "RefDomainsEDU = '0',"
        . "RefDomainsGOV = '0',"
        . "Title = 'title',"
        . "total_scraped_links = '0',"
        . "CitationFlow = '0',"
        . "TrustFlow = '0',"
        . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0',"
        . "TopicalTrustFlow_Value_0 = '0',"
        . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1',"
        . "TopicalTrustFlow_Value_1 = '0',"
        . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2',"
        . "TopicalTrustFlow_Value_2 = '0',"
        . "date_created = '" . date('Y-m-d H:i:s') . "',"
        . "user_id = 1";
    $result = mysqli_query($link, $query);
    if (!$result) {
        // Record the failed insert (duplicate key, bad data, ...) so the
        // crawl can continue; values are escaped for the same reason.
        mysqli_query($link, "insert into domainerror SET error = '" . $esc_key
            . "' , domains= '" . $esc_domain . "', type='fail'");
        return false;
    } else {
        return true;
    }
}
I really have no idea how to optimize it further so it can grab more records; I have already optimized it as much as I can. If I use PHP exec() calls instead of curl, it exhausts MySQL's maximum connections; if I use pthreads, it runs the first time and then stops.
A friendly bot does not try to scrape a site as fast as possible. Try limiting it to a few pages per minute — you will still get the data you need in the end. Respect robots.txt, and identify who you are and what you do in your User-Agent string. Don't try to be a bad bot. –