由於函數很長,你其實可以跳過各個函數的細節,因爲它們只是從 HTML 標籤中提取特定數據。請把它當作一個這種規模的問題來看待,並提供思路來幫助我。我正在建立一個網站,用來抓取某個特定的網站,並提取其幾乎完整的數據庫。實際的函數非常長,會向 4 個不同的表插入數據,估計合計約 6 萬行。我的問題是:即使函數已經執行完畢,我仍然能看到新的行被添加到我的數據庫中。我猜測 MySQL 的插入速度低於我的迭代速度。(標題:即使函數已停止,MySQL 仍在插入新行)
控制器
/**
 * Crawl remote records one id at a time, resuming after the highest id
 * already stored in the temporary company table.
 *
 * The loop stops once 25 consecutive ids come back as 'empty' from
 * _scrap_all(); any non-empty record resets that counter.
 *
 * NOTE(review): the resume point is MAX(id) of the temp table, which only
 * lines up with the remote PMJU_KOD value if no remote id was skipped as
 * empty on an earlier run - confirm against the table definition.
 */
public function start()
{
    // Remove the execution time limit. ini directive names are case-sensitive,
    // so the name must be spelled in lower case for the call to take effect.
    ini_set('max_execution_time', '0');
    set_time_limit(0);

    // NULL, '' and '0' all cast to 0, so one query plus a cast is enough.
    $lastId = (int) $this->Kpkt_model->last_temp_id();

    // Number of consecutive empty records tolerated before giving up.
    $maxConsecutiveEmpty = 25;
    $errors = 0;

    // Start with the record right after the last stored one.
    for ($i = $lastId + 1; $errors < $maxConsecutiveEmpty; $i++)
    {
        // Pause for 100 microseconds between records.
        usleep(100);

        if ($this->_scrap_all($i) === 'empty')
        {
            // Empty record: one step closer to the stop condition.
            $errors++;
        }
        else
        {
            $errors = 0;
        }
    }
}
/**
 * Scrape the company page for one remote id, together with every project,
 * development and selling-information page linked from it, and store the
 * extracted values through Kpkt_model.
 *
 * @param  int $i Value placed in the PMJU_KOD query parameter.
 * @return string|null 'empty' when the page could not be loaded or its first
 *                     cell holds fewer than 4 characters; null after the
 *                     records have been handed to the model.
 */
function _scrap_all($i)
{
    $url = "mydesiredwebsite.com?PMJU_KOD=$i";
    $html = file_get_html($url);
    if ( ! $html)
    {
        // Nothing could be parsed; report it like an empty record instead of
        // failing on ->find() below.
        return 'empty';
    }

    $cells = $this->_clean_cells($html->find('td[class=tdSecondtext1]'));

    $name        = $this->_cell_text($cells, 0);
    $file        = $this->_cell_text($cells, 1);
    $roc         = $this->_cell_text($cells, 2);
    $address     = $this->_cell_text($cells, 3);
    $city        = $this->_cell_text($cells, 4);
    $postcode    = $this->_cell_text($cells, 5);
    $district    = $this->_cell_text($cells, 6);
    $state       = $this->_cell_text($cells, 7);
    $telephone   = $this->_cell_text($cells, 8);
    $fax         = $this->_cell_text($cells, 9);
    $website     = $this->_cell_text($cells, 10);
    $last_update = $this->_cell_text($cells, 11);

    // A name shorter than 4 characters is treated as the error page.
    if (strlen($name) < 4)
    {
        // Release the DOM on this path too; it was previously skipped.
        $html->clear();
        unset($html);
        return 'empty';
    }

    $this->Kpkt_model->insert_company_temp(
        $name,
        $file,
        $roc,
        $address,
        $city,
        $postcode,
        $district,
        $state,
        $telephone,
        $fax,
        $website,
        $this->_to_sql_date($last_update)
    );

    // Copy the link targets out first so the company DOM can be released
    // before the linked pages are loaded.
    $projectUrls = $this->_link_urls($html, 'a[href^=DetailProjek.cfm]');
    $html->clear();
    unset($html);

    foreach ($projectUrls as $projectUrl)
    {
        $this->_scrap_project($projectUrl);
    }
}

/**
 * Scrape one project detail page: the project row, its development rows and
 * every selling-information page it links to.
 *
 * @param string $url Address of the project detail page.
 */
private function _scrap_project($url)
{
    $page = file_get_html($url);
    if ( ! $page)
    {
        // Skip a page that could not be loaded and keep crawling.
        return;
    }

    $cells = $this->_clean_cells($page->find('td[class=tdSecondtext1]'));

    // Cell 1 is stored as the developer reference on every dependent row.
    $developer = $this->_cell_text($cells, 1);

    $this->Kpkt_model->add_project_information_temp(
        $developer,
        $this->_cell_text($cells, 12),                      // name
        $this->_cell_text($cells, 13),                      // file
        $this->_cell_text($cells, 14),                      // lot_no
        $this->_cell_text($cells, 15),                      // state
        $this->_cell_text($cells, 16),                      // housing_no
        $this->_cell_text($cells, 17),                      // bank_name
        $this->_cell_text($cells, 18),                      // license_no
        $this->_to_sql_date($this->_cell_text($cells, 19)), // license_expire
        $this->_cell_text($cells, 20),                      // permit_no
        $this->_to_sql_date($this->_cell_text($cells, 21)), // permit_expire
        $this->_cell_text($cells, 22)                       // land_status
    );
    $project_id = $this->Kpkt_model->get_last_id();

    // The first 23 cells are the company and primary project details; what
    // follows is a variable number of development rows, 12 cells each.
    foreach (array_chunk(array_slice($cells, 23), 12) as $row)
    {
        $this->Kpkt_model->add_project_development_information_temp(
            $developer,
            $project_id,
            $this->_cell_text($row, 0),                      // category
            $this->_cell_text($row, 1),                      // type
            $this->_cell_text($row, 2),                      // storey
            $this->_cell_text($row, 3),                      // floorArea
            $this->_cell_text($row, 4),                      // totalArea
            $this->_cell_text($row, 5),                      // units
            $this->_to_sql_date($this->_cell_text($row, 6)), // tcf
            $this->_to_sql_date($this->_cell_text($row, 7)), // cf
            $this->_cell_text($row, 8),                      // priceMax
            $this->_cell_text($row, 9),                      // priceStandard
            $this->_cell_text($row, 10),                     // priceMin
            $this->_cell_text($row, 11)                      // progressReport
        );
    }

    // Collect the selling-report links, then release this page (not the
    // company page) before loading the reports.
    $sellingUrls = $this->_link_urls($page, 'a[href^=LaporanJualRumah.cfm]');
    $page->clear();
    unset($page);

    foreach ($sellingUrls as $sellingUrl)
    {
        $this->_scrap_selling($sellingUrl, $developer, $project_id);
    }
}

/**
 * Scrape one selling-information page and store each group of 12 cells as
 * one selling-information row.
 *
 * @param string $url        Address of the selling-information page.
 * @param string $developer  Developer reference stored on each row.
 * @param mixed  $project_id Project id stored on each row.
 */
private function _scrap_selling($url, $developer, $project_id)
{
    $page = file_get_html($url);
    if ( ! $page)
    {
        return;
    }

    $cells = $this->_clean_cells($page->find('tr[bgcolor!=#fc6535] td div font'));
    $page->clear();
    unset($page);

    foreach (array_chunk($cells, 12) as $row)
    {
        $this->Kpkt_model->add_selling_information_temp(
            $developer,
            $project_id,
            $this->_cell_text($row, 0),  // roomType
            $this->_cell_text($row, 1),  // levels
            $this->_cell_text($row, 2),  // local
            $this->_cell_text($row, 3),  // chinese
            $this->_cell_text($row, 4),  // indian
            $this->_cell_text($row, 5),  // other
            $this->_cell_text($row, 6),  // foreign
            $this->_cell_text($row, 7),  // totalSold
            $this->_cell_text($row, 8),  // totalUnsold
            $this->_cell_text($row, 9),  // totalPerUnit
            $this->_cell_text($row, 10), // approvedUnits
            $this->_cell_text($row, 11)  // developedUnits
        );
    }
}

/**
 * Turn matched nodes into strings with runs of blanks collapsed to a single
 * space and HTML entities removed.
 *
 * @param  array $nodes Nodes returned by find().
 * @return array Cleaned strings; an empty array if the replacement failed.
 */
private function _clean_cells($nodes)
{
    $cells = preg_replace("/[[:blank:]]+/", " ", $nodes);
    $cells = preg_replace("/&#?[a-z0-9]{2,8};/i", "", $cells);

    return is_array($cells) ? $cells : array();
}

/**
 * Return one cleaned cell with line breaks and tags stripped.
 *
 * @param  array $cells Output of _clean_cells() or one chunk of it.
 * @param  int   $index Position of the wanted cell.
 * @return string The cell text, or '' when the cell does not exist.
 */
private function _cell_text($cells, $index)
{
    if ( ! isset($cells[$index]))
    {
        return '';
    }

    return strip_tags(str_replace(array("\r", "\n"), "", $cells[$index]));
}

/**
 * Convert a slash-separated date string to 'Y-m-d'.
 *
 * Slashes are swapped for dashes before the value goes through strtotime().
 *
 * @param  string $value Date text as scraped.
 * @return string Date formatted as 'Y-m-d'.
 */
private function _to_sql_date($value)
{
    return date('Y-m-d', strtotime(str_replace('/', '-', $value)));
}

/**
 * Build the absolute address of every link matched by a selector.
 *
 * @param  object $dom      Parsed page to search.
 * @param  string $selector Selector for the anchors.
 * @return array  One address per matched anchor, spaces encoded as %20.
 */
private function _link_urls($dom, $selector)
{
    $urls = array();
    foreach ($dom->find($selector) as $link)
    {
        $anchor = str_replace(' ', '%20', $link->getAttribute('href'));
        $urls[] = "mydesiredwebsite.com/$anchor";
    }

    return $urls;
}
模型
/**
 * Insert one company record into kpkt_company_temp.
 *
 * Each argument is stored in the column of the same name.
 */
function insert_company_temp($name, $file, $roc, $address, $city, $postcode, $district, $state, $telephone, $fax, $website, $last_update)
{
    // Column names equal the parameter names, so compact() builds the row.
    $row = compact(
        'file',
        'name',
        'roc',
        'address',
        'city',
        'postcode',
        'district',
        'state',
        'telephone',
        'fax',
        'website',
        'last_update'
    );

    $this->db->insert('kpkt_company_temp', $row);
}
/**
 * Fetch the highest id in kpkt_company.
 *
 * @return mixed The 'id' field of the MAX(id) query result.
 */
function last_id()
{
    $result = $this->db->query("SELECT MAX(id) AS id FROM kpkt_company");

    return $result->row('id');
}
/**
 * Fetch the highest id in kpkt_company_temp.
 *
 * @return mixed The 'id' field of the MAX(id) query result.
 */
function last_temp_id()
{
    $result = $this->db->query("SELECT MAX(id) AS id FROM kpkt_company_temp");

    return $result->row('id');
}
/**
 * Insert one project record into kpkt_project_information_temp.
 *
 * $developer is stored as developer_id and $license_no as license; every
 * other argument goes into the column of the same name.
 */
function add_project_information_temp(
$developer,
$name,
$file,
$lot_no,
$state,
$housing_no,
$bank_name,
$license_no,
$license_expire,
$permit_no,
$permit_expire,
$land_status
)
{
    $row = array();
    $row['developer_id']   = $developer;
    $row['name']           = $name;
    $row['file']           = $file;
    $row['lot_no']         = $lot_no;
    $row['state']          = $state;
    $row['housing_no']     = $housing_no;
    $row['bank_name']      = $bank_name;
    $row['license']        = $license_no;
    $row['license_expire'] = $license_expire;
    $row['permit_no']      = $permit_no;
    $row['permit_expire']  = $permit_expire;
    $row['land_status']    = $land_status;

    $this->db->insert('kpkt_project_information_temp', $row);
}
/**
 * Insert one development row into kpkt_project_development_information_temp.
 *
 * The arguments are mapped onto the table's column names below.
 */
function add_project_development_information_temp(
$developer,
$project_id,
$category,
$type,
$storey,
$floorArea,
$totalArea,
$units,
$tcf,
$cf,
$priceMax,
$priceStandard,
$priceMin,
$progressReport
)
{
    $row = array();
    $row['developer_id']    = $developer;
    $row['project_id']      = $project_id;
    $row['house_category']  = $category;
    $row['house_type']      = $type;
    $row['levels']          = $storey;
    $row['floor_area']      = $floorArea;
    $row['total_area']      = $totalArea;
    $row['units']           = $units;
    $row['tcf']             = $tcf;
    $row['cf']              = $cf;
    $row['price_max']       = $priceMax;
    $row['price_standard']  = $priceStandard;
    $row['price_min']       = $priceMin;
    $row['progress_report'] = $progressReport;

    $this->db->insert('kpkt_project_development_information_temp', $row);
}
/**
 * Insert one selling-information row into kpkt_selling_information_temp.
 *
 * The arguments are mapped onto the table's column names below.
 */
function add_selling_information_temp(
$company_id,
$project_id,
$roomType,
$levels,
$local,
$chinese,
$indian,
$other,
$foreign,
$totalSold,
$totalUnsold,
$totalPerUnit,
$approvedUnits,
$developedUnits
)
{
    $row = array();
    $row['developer_id']    = $company_id;
    $row['project_id']      = $project_id;
    $row['house_type']      = $roomType;
    $row['levels']          = $levels;
    $row['bumi']            = $local;
    $row['chinese']         = $chinese;
    $row['indian']          = $indian;
    $row['other']           = $other;
    $row['foreigner']       = $foreign;
    $row['units_sold']      = $totalSold;
    $row['units_unsold']    = $totalUnsold;
    $row['price_per_unit']  = $totalPerUnit;
    $row['approved_units']  = $approvedUnits;
    $row['developed_units'] = $developedUnits;

    $this->db->insert('kpkt_selling_information_temp', $row);
}
再次澄清一下:一切都運行得非常流暢,沒有任何問題,只是 INSERT 語句的執行似乎一直落後於循環。我該怎樣在 function start() 中調節 while 循環的速度?謝謝
說真的,誰會把這些全部讀完?只貼出代碼的相關部分如何? – Rob 2013-03-28 02:01:04
展示您的問題所需的最小代碼量是多少? – Patashu 2013-03-28 02:01:19
我已經事先提醒過篇幅很長了 :) @Patashu 請讓我修剪一下 – 2013-03-28 02:02:19