2014-01-31 63 views
0

我有一個腳本，其中有一個網址列表，我從這些URL獲取信息，例如名稱、城市、部門等。

PHP - Web抓取 - 如何使用cURL緩存？

以下是我的一些函數：

/**
 * Returns the text of the table cell this script treats as the city.
 *
 * Downloads $url with curl_get_contents(), parses the body with
 * str_get_html() and reads the `td` match at index 86.
 *
 * @param string $url Address of the page to scrape.
 *
 * @return string|null The cell's plaintext, or null when the page could not
 *                     be downloaded or parsed, or has no `td` at that index.
 */
function getCity($url)
{
    $html = curl_get_contents($url);
    if ($html === false) {
        // The transfer failed; there is nothing to parse.
        return null;
    }

    $dom = str_get_html($html);
    if (!is_object($dom)) {
        // NOTE(review): str_get_html() is presumably Simple HTML DOM's parser,
        // which hands back a non-object for empty/oversized input - confirm.
        return null;
    }

    // Guard against pages that have fewer cells than expected instead of
    // reading a property on a non-object.
    $cell = $dom->find('td', 86);

    return is_object($cell) ? $cell->plaintext : null;
}

/**
 * Returns the text of the table cell this script treats as the department.
 *
 * Downloads $url with curl_get_contents(), parses the body with
 * str_get_html() and reads the `td` match at index 90.
 *
 * @param string $url Address of the page to scrape.
 *
 * @return string|null The cell's plaintext, or null when the page could not
 *                     be downloaded or parsed, or has no `td` at that index.
 */
function getDepartment($url)
{
    $html = curl_get_contents($url);
    if ($html === false) {
        // The transfer failed; there is nothing to parse.
        return null;
    }

    $dom = str_get_html($html);
    if (!is_object($dom)) {
        // NOTE(review): str_get_html() is presumably Simple HTML DOM's parser,
        // which hands back a non-object for empty/oversized input - confirm.
        return null;
    }

    // Guard against pages that have fewer cells than expected instead of
    // reading a property on a non-object.
    $cell = $dom->find('td', 90);

    return is_object($cell) ? $cell->plaintext : null;
}

/**
 * Returns the trimmed text of the table cell this script treats as the salary.
 *
 * Downloads $url with curl_get_contents(), parses the body with
 * str_get_html() and reads the `td` match at index 94.
 *
 * @param string $url Address of the page to scrape.
 *
 * @return string The cell's plaintext with surrounding whitespace removed, or
 *                an empty string when the page could not be downloaded or
 *                parsed, or has no `td` at that index.
 */
function getSalary($url)
{
    $html = curl_get_contents($url);
    if ($html === false) {
        // The transfer failed; there is nothing to parse.
        return '';
    }

    $dom = str_get_html($html);
    if (!is_object($dom)) {
        // NOTE(review): str_get_html() is presumably Simple HTML DOM's parser,
        // which hands back a non-object for empty/oversized input - confirm.
        return '';
    }

    // Guard against pages that have fewer cells than expected instead of
    // reading a property on a non-object.
    $cell = $dom->find('td', 94);
    if (!is_object($cell)) {
        return '';
    }

    return trim($cell->plaintext);
}

這是我的cURL代碼:

/**
 * Downloads a URL with cURL and hands back the response body.
 *
 * Redirects are followed and a fixed Firefox user-agent string is sent.
 *
 * @param string $url Address to request.
 *
 * @return string|bool Whatever curl_exec() returned: the body as a string, or
 *                     false when the transfer failed.
 */
function curl_get_contents($url)
{
    $handle = curl_init();

    // Same options, applied in the same order, as one batch.
    curl_setopt_array($handle, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
        CURLOPT_FOLLOWLOCATION => 1,
    ]);

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

正如您所看到的，我為每個字段都發出一次請求，這非常低效。我想實現一個緩存，使每個URL只需請求一次，就能提取所有字段的信息。

在此先感謝。

+0

只需把返回的 $web 保存到一個文件中，並在需要的時候打開它。 – Kei

回答

0

您可以把您的函數封裝成一個類，像這樣：

/**
 * Downloads one page and exposes the fields scraped from it.
 *
 * The page is requested and parsed once, in the constructor; each getter then
 * reads from the already-parsed document, so asking for several fields does
 * not trigger several HTTP requests.
 *
 * Depends on a global str_get_html() function that is not defined here.
 * NOTE(review): presumably the parser from the Simple HTML DOM library, whose
 * result offers find($selector, $index) and nodes with ->plaintext - confirm.
 */
class Scrapper
{
    /** @var string Raw response body downloaded for the URL. */
    public $page_content;

    /** @var object Parsed document produced by the global str_get_html(). */
    public $html_object;

    /**
     * Downloads and parses the page.
     *
     * @param string $url Address of the page to scrape.
     *
     * @throws RuntimeException When the download fails or the body cannot be parsed.
     */
    public function __construct($url)
    {
        $content = $this->curl_get_contents($url);
        if ($content === false) {
            throw new RuntimeException('Could not download ' . $url);
        }

        $this->page_content = $content;
        $this->str_get_html($content);
    }

    /**
     * @return string|null Text of the `td` match at index 86, or null if absent.
     */
    public function getCity()
    {
        return $this->cellText(86);
    }

    /**
     * @return string|null Text of the `td` match at index 90, or null if absent.
     */
    public function getDepartment()
    {
        return $this->cellText(90);
    }

    /**
     * @return string Trimmed text of the `td` match at index 94; empty string if absent.
     */
    public function getSalary()
    {
        return trim((string) $this->cellText(94));
    }

    /**
     * Downloads a URL with cURL, following redirects and sending a fixed
     * Firefox user-agent string.
     *
     * @param string $url Address to request.
     *
     * @return string|bool Response body, or false when the transfer failed.
     */
    public function curl_get_contents($url)
    {
        $curl_moteur = curl_init();
        if ($curl_moteur === false) {
            return false;
        }

        curl_setopt_array($curl_moteur, [
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
            CURLOPT_FOLLOWLOCATION => 1,
        ]);

        $web = curl_exec($curl_moteur);
        curl_close($curl_moteur);

        return $web;
    }

    /**
     * Parses HTML into a document object and stores it in $html_object.
     *
     * @param string|null $html Markup to parse; defaults to the downloaded page.
     *
     * @return object The parsed document (also stored in $html_object).
     *
     * @throws RuntimeException When the parser does not return an object.
     */
    public function str_get_html($html = null)
    {
        if ($html === null) {
            $html = $this->page_content;
        }

        // An unqualified call resolves to the global function of the same
        // name, not to this method, so this does not recurse.
        $parsed = str_get_html($html);
        if (!is_object($parsed)) {
            throw new RuntimeException('Could not parse the downloaded page as HTML');
        }

        $this->html_object = $parsed;

        return $parsed;
    }

    /**
     * Reads the text of the `td` match at the given index.
     *
     * @param int $index Index passed to find() as its second argument.
     *
     * @return string|null The node's plaintext, or null when find() yields no node.
     */
    private function cellText($index)
    {
        $cell = $this->html_object->find('td', $index);

        return is_object($cell) ? $cell->plaintext : null;
    }
}

// Build the scraper for one page ($your_url must be set by the caller), then
// print two of its fields back to back with no separator between them.
$scrapper = new Scrapper($your_url);

echo $scrapper->getCity(), $scrapper->getDepartment();

注意，代碼未經測試。

這樣，您只在實例化類時請求一次URL。

或者，如果你不想使用對象，一個簡單的辦法是使用static變量：

/**
 * Downloads a URL with cURL, remembering each successful response for the
 * rest of the request so the same URL is only fetched once.
 *
 * Redirects are followed and a fixed Firefox user-agent string is sent.
 * The cache lives in a function-static array and is never evicted, so memory
 * use grows with the number of distinct URLs fetched.
 *
 * @param string $url Address to request.
 *
 * @return string|bool Response body, or false when the transfer failed.
 */
function curl_get_contents($url)
{
    // Keyed by URL: a single shared value would hand the first page's body
    // back for every later URL.
    static $cache = [];

    if (isset($cache[$url])) {
        return $cache[$url];
    }

    $curl_moteur = curl_init();
    curl_setopt($curl_moteur, CURLOPT_URL, $url);
    curl_setopt($curl_moteur, CURLOPT_RETURNTRANSFER, 1);

    curl_setopt($curl_moteur, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');

    curl_setopt($curl_moteur, CURLOPT_FOLLOWLOCATION, 1);
    $web = curl_exec($curl_moteur);
    curl_close($curl_moteur);

    // Only successful transfers are remembered, so a failed URL can be retried.
    if ($web !== false) {
        $cache[$url] = $web;
    }

    return $web;
}