2013-03-28 22 views
-2

由於函數很長,你其實可以跳過函數本體,因爲它們只是從HTML標籤中提取特定數據;問題的篇幅較大,請見諒,並請在邏輯上提供幫助。我正在建立一個網站,用來抓取某個特定網站,並提取其幾乎完整的數據庫。實際的函數非常長,會插入到4個不同的表中,估計合計約60k行。我的問題是:即使函數執行完畢之後,我仍然可以看到新的行被添加到我的數據庫。我猜測MySQL的插入速度低於我的迭代速度。(標題:MySQL在函數已停止後仍在插入新行)

控制器

public function start() 
{ 
    /*
     Crawl company records one id at a time, resuming after the highest id
     already stored in the temp table, and stop after 25 consecutive ids
     that _scrap_all() reports as 'empty'.
    */

    // Lift the execution time limit. ini directive names are case-sensitive
    // and 0 (not -1) means "no limit", so the previous
    // ini_set('MAX_EXECUTION_TIME', -1) call had no effect.
    ini_set('max_execution_time', '0'); 
    set_time_limit(0); 

    // MAX(id) comes back as NULL on an empty table and as a numeric value
    // otherwise; the cast maps NULL / '' / '0' to 0 with a single query.
    $lastId = (int) $this->Kpkt_model->last_temp_id(); 

    // first id to fetch
    $i = $lastId + 1; 
    // number of consecutive empty records seen so far
    $errors = 0; 

    while ($errors < 25) 
    { 
        // usleep() takes microseconds: this is a 0.1 ms pause between requests
        usleep(100); 

        if ($this->_scrap_all($i) === 'empty') 
        { 
            // empty record: one step closer to giving up
            $errors++; 
        } 
        else 
        { 
            // a real record resets the streak
            $errors = 0; 
        } 
        $i++; 
    } 
} 



/**
 * Scrape one company page plus its project and selling-information
 * sub-pages, and store everything through Kpkt_model.
 *
 * @param int $i  value placed in the PMJU_KOD query parameter
 * @return string|null  'empty' when the company page could not be loaded or
 *                      carries no usable name; null after a record was stored
 */
function _scrap_all($i) 
{ 
    // NOTE(review): the host below has no scheme; it looks like a
    // placeholder and is kept as-is. Confirm the real base URL.
    $url = "mydesiredwebsite.com?PMJU_KOD=$i"; 

    $html = file_get_html($url); 
    if (!$html) 
    { 
        // file_get_html() yields false when the page cannot be loaded.
        // NOTE(review): a failed fetch is counted as an empty record here
        // instead of dying on ->find(); confirm that is the wanted policy.
        return 'empty'; 
    } 

    $cells = $this->_clean_cells($html->find('td[class=tdSecondtext1]')); 

    $name = $this->_cell($cells, 0); 

    // a too-short name means this is the error page, not a company record
    if (strlen($name) < 4) 
    { 
        // release the DOM before leaving; this method runs in a long loop
        $html->clear(); 
        unset($html); 
        return 'empty'; 
    } 

    $file = $this->_cell($cells, 1); 
    $roc = $this->_cell($cells, 2); 
    $address = $this->_cell($cells, 3); 
    $city = $this->_cell($cells, 4); 
    $postcode = $this->_cell($cells, 5); 
    $district = $this->_cell($cells, 6); 
    $state = $this->_cell($cells, 7); 
    $telephone = $this->_cell($cells, 8); 
    $fax = $this->_cell($cells, 9); 
    $website = $this->_cell($cells, 10); 
    $last_update = $this->_to_sql_date($this->_cell($cells, 11)); 

    $this->Kpkt_model->insert_company_temp($name, $file, $roc, $address, $city, $postcode, $district, $state, $telephone, $fax, $website, $last_update); 

    // Collect the project links as plain strings so the company DOM can be
    // released before any sub-page is fetched.
    $projectHrefs = array(); 
    foreach ($html->find('a[href^=DetailProjek.cfm]') as $link) 
    { 
        // spaces in the href are percent-encoded
        $projectHrefs[] = str_replace(' ', '%20', $link->getAttribute('href')); 
    } 
    $html->clear(); 
    unset($html); 

    foreach ($projectHrefs as $projectHref) 
    { 
        $html2 = file_get_html("mydesiredwebsite.com/$projectHref"); 
        if (!$html2) 
        { 
            // project page could not be loaded: skip it
            continue; 
        } 

        $cells = $this->_clean_cells($html2->find('td[class=tdSecondtext1]')); 

        $sellingHrefs = array(); 
        foreach ($html2->find('a[href^=LaporanJualRumah.cfm]') as $selling) 
        { 
            $sellingHrefs[] = str_replace(' ', '%20', $selling->getAttribute('href')); 
        } 

        // Everything needed is copied out; free the project DOM (the old
        // code cleared $html here by mistake and never cleared $html2).
        $html2->clear(); 
        unset($html2); 

        // this is our foreign key
        $developer = $this->_cell($cells, 1); 

        // cells 12-22 hold the primary project details
        $name = $this->_cell($cells, 12); 
        $file = $this->_cell($cells, 13); 
        $lot_no = $this->_cell($cells, 14); 
        $state = $this->_cell($cells, 15); 
        $housing_no = $this->_cell($cells, 16); 
        $bank_name = $this->_cell($cells, 17); 
        $license_no = $this->_cell($cells, 18); 
        $license_expire = $this->_to_sql_date($this->_cell($cells, 19)); 
        $permit_no = $this->_cell($cells, 20); 
        $permit_expire = $this->_to_sql_date($this->_cell($cells, 21)); 
        $land_status = $this->_cell($cells, 22); 

        $this->Kpkt_model->add_project_information_temp(
            $developer, 
            $name, 
            $file, 
            $lot_no, 
            $state, 
            $housing_no, 
            $bank_name, 
            $license_no, 
            $license_expire, 
            $permit_no, 
            $permit_expire, 
            $land_status 
        ); 

        // NOTE(review): get_last_id() is defined outside this view;
        // presumably the id of the project row just inserted. Confirm.
        $project_id = $this->Kpkt_model->get_last_id(); 

        /*
         Skip the first 23 cells (company details + primary project
         details); the remainder is development rows of 12 cells each.
         MD 30/01/2013
        */
        $developmentRows = array_chunk(array_slice($cells, 23), 12); 
        foreach ($developmentRows as $row) 
        { 
            $category = $this->_cell($row, 0); 
            $type = $this->_cell($row, 1); 
            $storey = $this->_cell($row, 2); 
            $floorArea = $this->_cell($row, 3); 
            $totalArea = $this->_cell($row, 4); 
            $units = $this->_cell($row, 5); 
            $tcf = $this->_to_sql_date($this->_cell($row, 6)); 
            $cf = $this->_to_sql_date($this->_cell($row, 7)); 
            $priceMax = $this->_cell($row, 8); 
            $priceStandard = $this->_cell($row, 9); 
            $priceMin = $this->_cell($row, 10); 
            $progressReport = $this->_cell($row, 11); 

            $this->Kpkt_model->add_project_development_information_temp(
                $developer, 
                $project_id, 
                $category, 
                $type, 
                $storey, 
                $floorArea, 
                $totalArea, 
                $units, 
                $tcf, 
                $cf, 
                $priceMax, 
                $priceStandard, 
                $priceMin, 
                $progressReport 
            ); 
        } 

        foreach ($sellingHrefs as $sellingHref) 
        { 
            $html3 = file_get_html("mydesiredwebsite.com/$sellingHref"); 
            if (!$html3) 
            { 
                // selling page could not be loaded: skip it
                continue; 
            } 

            $sellingCells = $this->_clean_cells($html3->find('tr[bgcolor!=#fc6535] td div font')); 
            $html3->clear(); 
            unset($html3); 

            // each run of 12 cells is one selling-information row
            foreach (array_chunk($sellingCells, 12) as $value) 
            { 
                $this->Kpkt_model->add_selling_information_temp(
                    $developer, 
                    $project_id, 
                    $this->_cell($value, 0),  // room type
                    $this->_cell($value, 1),  // levels
                    $this->_cell($value, 2),  // local
                    $this->_cell($value, 3),  // chinese
                    $this->_cell($value, 4),  // indian
                    $this->_cell($value, 5),  // other
                    $this->_cell($value, 6),  // foreign
                    $this->_cell($value, 7),  // total sold
                    $this->_cell($value, 8),  // total unsold
                    $this->_cell($value, 9),  // total per unit
                    $this->_cell($value, 10), // approved units
                    $this->_cell($value, 11)  // developed units
                ); 
            } 
        } 
    } 
} 

/**
 * Turn a list of DOM nodes into a list of plain, cleaned strings.
 *
 * Each node is cast to string, runs of blanks are collapsed to one space,
 * HTML entities are dropped, CR/LF are removed and tags are stripped.
 *
 * @param array $nodes  nodes returned by ->find()
 * @return array  cleaned strings, in the same order
 */
function _clean_cells($nodes) 
{ 
    $cleaned = array(); 
    foreach ($nodes as $node) 
    { 
        $text = preg_replace("/[[:blank:]]+/", " ", (string) $node); 
        $text = preg_replace("/&#?[a-z0-9]{2,8};/i", "", $text); 
        $cleaned[] = strip_tags(str_replace(array("\r", "\n"), "", $text)); 
    } 
    return $cleaned; 
} 

/**
 * Read one cleaned cell, yielding '' when the page had fewer cells than
 * expected (avoids undefined-offset notices on short pages/chunks).
 *
 * @param array $cells  output of _clean_cells() or a chunk of it
 * @param int   $index  position of the wanted cell
 * @return string
 */
function _cell($cells, $index) 
{ 
    return isset($cells[$index]) ? $cells[$index] : ''; 
} 

/**
 * Convert a slash-separated date string into 'Y-m-d'.
 *
 * Slashes are swapped for dashes before strtotime() parses the value.
 * An unparseable value maps to timestamp 0, i.e. the same '1970-01-01'
 * the previous inline code produced.
 *
 * @param string $value  date text taken from the page
 * @return string  date formatted as Y-m-d
 */
function _to_sql_date($value) 
{ 
    $timestamp = strtotime(str_replace('/', '-', $value)); 
    if ($timestamp === false) 
    { 
        $timestamp = 0; 
    } 
    return date('Y-m-d', $timestamp); 
} 

模型

/**
 * Insert one company row into kpkt_company_temp.
 *
 * Every argument is stored in the column of the same name.
 */
function insert_company_temp($name, $file, $roc, $address, $city, $postcode, $district, $state, $telephone, $fax, $website, $last_update) 
{ 
    $this->db->insert('kpkt_company_temp', array(
        'file'        => $file, 
        'name'        => $name, 
        'roc'         => $roc, 
        'address'     => $address, 
        'city'        => $city, 
        'postcode'    => $postcode, 
        'district'    => $district, 
        'state'       => $state, 
        'telephone'   => $telephone, 
        'fax'         => $fax, 
        'website'     => $website, 
        'last_update' => $last_update 
    )); 
} 

/**
 * Fetch the highest id currently present in kpkt_company.
 *
 * @return mixed  the `id` field of the MAX(id) result row
 */
function last_id() 
{ 
    $result = $this->db->query("SELECT MAX(id) AS id FROM kpkt_company"); 

    return $result->row('id'); 
} 

/**
 * Fetch the highest id currently present in kpkt_company_temp.
 *
 * @return mixed  the `id` field of the MAX(id) result row
 */
function last_temp_id() 
{ 
    $result = $this->db->query("SELECT MAX(id) AS id FROM kpkt_company_temp"); 

    return $result->row('id'); 
} 

/**
 * Insert one project row into kpkt_project_information_temp.
 *
 * $developer is stored as developer_id and $license_no as license;
 * the other arguments go into the column of the same name.
 */
function add_project_information_temp(
    $developer, 
    $name, 
    $file, 
    $lot_no, 
    $state, 
    $housing_no, 
    $bank_name, 
    $license_no, 
    $license_expire, 
    $permit_no, 
    $permit_expire, 
    $land_status 
) 
{ 
    $this->db->insert('kpkt_project_information_temp', array(
        'developer_id'   => $developer, 
        'name'           => $name, 
        'file'           => $file, 
        'lot_no'         => $lot_no, 
        'state'          => $state, 
        'housing_no'     => $housing_no, 
        'bank_name'      => $bank_name, 
        'license'        => $license_no, 
        'license_expire' => $license_expire, 
        'permit_no'      => $permit_no, 
        'permit_expire'  => $permit_expire, 
        'land_status'    => $land_status 
    )); 
} 


/**
 * Insert one development row into
 * kpkt_project_development_information_temp.
 *
 * Column mapping is spelled out in the array below; note that $storey is
 * stored in the `levels` column.
 */
function add_project_development_information_temp(
    $developer, 
    $project_id, 
    $category, 
    $type, 
    $storey, 
    $floorArea, 
    $totalArea, 
    $units, 
    $tcf, 
    $cf, 
    $priceMax, 
    $priceStandard, 
    $priceMin, 
    $progressReport 
) 
{ 
    $this->db->insert('kpkt_project_development_information_temp', array(
        'developer_id'    => $developer, 
        'project_id'      => $project_id, 
        'house_category'  => $category, 
        'house_type'      => $type, 
        'levels'          => $storey, 
        'floor_area'      => $floorArea, 
        'total_area'      => $totalArea, 
        'units'           => $units, 
        'tcf'             => $tcf, 
        'cf'              => $cf, 
        'price_max'       => $priceMax, 
        'price_standard'  => $priceStandard, 
        'price_min'       => $priceMin, 
        'progress_report' => $progressReport 
    )); 
} 

/**
 * Insert one selling-information row into kpkt_selling_information_temp.
 *
 * Column mapping is spelled out in the array below; note that $local is
 * stored in `bumi` and $totalPerUnit in `price_per_unit`.
 */
function add_selling_information_temp(
    $company_id, 
    $project_id, 
    $roomType, 
    $levels, 
    $local, 
    $chinese, 
    $indian, 
    $other, 
    $foreign, 
    $totalSold, 
    $totalUnsold, 
    $totalPerUnit, 
    $approvedUnits, 
    $developedUnits 
) 
{ 
    $this->db->insert('kpkt_selling_information_temp', array(
        'developer_id'    => $company_id, 
        'project_id'      => $project_id, 
        'house_type'      => $roomType, 
        'levels'          => $levels, 
        'bumi'            => $local, 
        'chinese'         => $chinese, 
        'indian'          => $indian, 
        'other'           => $other, 
        'foreigner'       => $foreign, 
        'units_sold'      => $totalSold, 
        'units_unsold'    => $totalUnsold, 
        'price_per_unit'  => $totalPerUnit, 
        'approved_units'  => $approvedUnits, 
        'developed_units' => $developedUnits 
    )); 
} 

再次澄清一下:一切都運行得非常流暢,沒有任何問題,只是INSERT語句可以說一直落在後面。我怎樣才能在 function start() 中對 while 循環進行限速?謝謝

+1

說真的,誰會讀所有這些?如何粘貼代碼的相關部分? – Rob 2013-03-28 02:01:04

+0

展示您的問題所需的最小代碼量是多少? – Patashu 2013-03-28 02:01:19

+0

我試圖警告有關的大小:) @Patashu讓我修剪請 – 2013-03-28 02:02:19

回答

2

你正在使用usleep,這是一個微秒的暫停; http://php.net/manual/en/function.usleep.php

你用的是 usleep(100),這樣的暫停(100微秒)幾乎察覺不到。

也許可以改用 sleep(1),或者把 usleep 的參數調到 100000(= 0.1秒)。

+0

我在開始時使用'usleep(500000)',但仍遇到同樣的問題。 – 2013-03-28 02:09:06

+0

仍然只有0.5秒的停頓。嘗試暫停1-2秒,看看它是否趕上了 – 2013-03-28 02:09:45

+0

使用超過2秒的值就可以了,謝謝