2011-10-01 168 views
3

我正在使用 PHP Simple HTML DOM Parser 對一個頁面的一堆子頁面進行螢幕抓取(screen scraping)。出於某種原因,它能解析前 40 個子頁面,但處理到第 41 個時,腳本就在沒有任何錯誤訊息的情況下終止了。

我製作了這個測試頁面(this test page),並嘗試記錄腳本中執行的所有操作以及 Simple HTML DOM Parser 中的一些事件(events),但我一直無法找到錯誤所在。

有沒有人知道為什麼在解析第 41 個 URL 時會失敗?或者有人知道 Simple HTML DOM Parser 在哪些情況下會失敗嗎?

我的測試頁:http://snuzzer.dk/pub/shdp/parse.php

這是我的腳本,我使用的是未經修改的 Simple HTML DOM Parser。關鍵的部分發生在 get_lections() 中,我已經在調用 Simple HTML DOM Parser 的位置做了標記。

// Values stored under the 'status' key of a lection.
define('LECTION_STATUS_REGULAR', 0);
define('LECTION_STATUS_CHANGED', 1);
define('LECTION_STATUS_CANCELLED', 2);

// Values stored under the 'documents' key of a lection.
define('LECTION_DOCUMENTS_NONE', 0);
define('LECTION_DOCUMENTS_TRUE', 1);

// Week numbers above this value are treated as belonging to the next year.
define('AMOUNT_OF_WEEKS_IN_A_YEAR', 52);

include_once("simple_html_dom.php"); 

/**
 * Reduce an HTML fragment to trimmed plain text encoded as ISO-8859-1.
 *
 * Tags are stripped, HTML entities (including both quote styles) are decoded
 * as UTF-8, surrounding whitespace is removed, and the result is converted
 * from UTF-8 to ISO-8859-1.
 *
 * @param string $text HTML fragment, treated as UTF-8.
 * @return string Plain text in ISO-8859-1.
 */
function clean_text($text)
{
    $text = strip_tags((string) $text);
    $text = html_entity_decode($text, ENT_QUOTES, "UTF-8");

    // Trim last: trimming before the tags are stripped leaves behind any
    // whitespace that sat just inside a tag, e.g. "<b> x </b>".
    $text = trim($text);

    // utf8_decode() is deprecated as of PHP 8.2, so use mbstring when it is
    // loaded and fall back to utf8_decode() otherwise.
    if (function_exists('mb_convert_encoding'))
    {
        return mb_convert_encoding($text, 'ISO-8859-1', 'UTF-8');
    }

    return utf8_decode($text);
}

/**
 * Collect the absolute URLs of every lection linked from the weekly schedule pages.
 *
 * For each week the schedule page is fetched with Simple HTML DOM Parser and
 * the href of every regular, changed and cancelled lection anchor is
 * collected (in that order). Progress is echoed into a <textarea>.
 *
 * A week whose page cannot be fetched is reported and skipped instead of
 * aborting the whole run.
 *
 * NOTE(review): week and year are concatenated as-is, so a week value of 1
 * with year 2012 yields "12012" - confirm whether the site expects a
 * zero-padded week number.
 *
 * @param array $weeks List of arrays with 'week' and 'year' keys.
 * @return array List of URL strings; empty when no lection was found.
 */
function get_links_for_lections($weeks)
{
    // Exact class attributes of the anchors to collect, in collection order.
    $selectors = array(
        'a[class="s2skemabrik s2bgbox s2withlink"]',
        'a[class="s2skemabrik s2bgbox s2changed s2withlink"]',
        'a[class="s2skemabrik s2bgbox s2cancelled s2withlink"]',
    );

    // Initialised up front so an empty result is an empty array rather than
    // an undefined variable.
    $links = array();

    echo "Finding links<br /><textarea style=\"width:70%;height:150px;\">";

    foreach ($weeks as $week)
    {
        // This is where Simple HTML DOM Parser is called.
        echo " * Retrieving HTML...\n";
        $html = file_get_html("http://www.lectio.dk/lectio/285/SkemaNy.aspx?type=elev&elevid=2444366210&week=" . $week['week'] . $week['year']);

        // file_get_html() returns false on failure; calling find() on that
        // would be a fatal error.
        if (!$html)
        {
            echo " * Could not retrieve HTML, skipping week\n";
            continue;
        }

        echo " * HTML retrieved...\n";

        foreach ($selectors as $selector)
        {
            foreach ($html->find($selector) as $lection)
            {
                $links[] = "http://www.lectio.dk" . $lection->href;
            }
        }

        // Release the DOM explicitly so memory does not pile up across weeks.
        $html->clear();
        unset($html);
    }

    echo "</textarea> \n    <hr />";

    return $links;
}

/**
 * Return the first element below $node matching $selector, or null if none.
 *
 * @param object $node     Simple HTML DOM node to search in.
 * @param string $selector Selector passed to find().
 * @return object|null
 */
function lection_first_match($node, $selector)
{
    $matches = $node->find($selector);

    return isset($matches[0]) ? $matches[0] : null;
}

/**
 * Fetch every lection page and extract its details from the table rows.
 *
 * Each row is identified by the text of its first <th>; the first <td> of
 * the row supplies the value. Progress and the parsed lection are echoed
 * into a <textarea> per link.
 *
 * A link whose page cannot be fetched is reported and skipped. Rows without
 * a <th> get an empty row name, and a missing <td> is treated as empty
 * content.
 *
 * NOTE(review): no "Status:" row is handled here, so
 * LECTION_STATUS_CANCELLED is never assigned by this function.
 *
 * @param array $links List of lection URLs.
 * @return array List of lection arrays. Each has 'status' and 'documents',
 *               plus any of 'type', 'title', 'subject', 'teachers',
 *               'location', 'note' and 'homework' found on the page.
 */
function get_lections($links)
{
    // Row heading => lection key, for rows stored as cleaned inner text.
    $text_rows = array(
        "Type:" => 'type',
        "Titel:" => 'title',
        "Hold:" => 'subject',
        "Lærere:" => 'teachers',
        "Lokaler:" => 'location',
        "Note:" => 'note',
        "Lektier:" => 'homework',
    );

    $lections = array();

    $num = 1;
    foreach ($links as $link)
    {
        echo $num . ". " . $link . "<br /> \n     <textarea style=\"width:70%;height:150px;\">";

        echo " * Retrieving HTML...\n";
        $html = file_get_html($link);

        // file_get_html() returns false on failure; calling find() on that
        // would be a fatal error.
        if (!$html)
        {
            echo " * Could not retrieve HTML, lection skipped</textarea><br /><br />";
            $num += 1;
            continue;
        }

        echo " * HTML retrieved\n";

        $lection = array();
        $lection['status'] = LECTION_STATUS_REGULAR;
        $lection['documents'] = LECTION_DOCUMENTS_NONE;

        foreach ($html->find("tr") as $row)
        {
            echo " * New cell\n";

            $heading = lection_first_match($row, "th");
            $row_name = ($heading !== null) ? (string) $heading->innertext : "";

            echo " - Row name: \"" . $row_name . "\"\n";

            if (isset($text_rows[$row_name]))
            {
                $key = $text_rows[$row_name];

                echo " - Checking " . $key . "...\n";

                $cell = lection_first_match($row, "td");
                $lection[$key] = clean_text(($cell !== null) ? $cell->innertext : "");

                echo " - " . ucfirst($key) . " checked\n";
            }
            else if ($row_name == "Dokumenter:")
            {
                echo " - Checking documents...\n";

                $cell = lection_first_match($row, "td");
                $content = clean_text(($cell !== null) ? $cell->plaintext : "");
                if ($content)
                {
                    // The document titles are not visible without logging in,
                    // so only record that documents exist.
                    $lection['documents'] = LECTION_DOCUMENTS_TRUE;
                }

                echo " - Documents checked\n";
            }
            else if ($row_name == "Vises:")
            {
                echo " - Checking status (part 1)...\n";

                $cell = lection_first_match($row, "td");
                $content = clean_text(($cell !== null) ? $cell->plaintext : "");
                if (strstr($content, ","))
                {
                    // A comma here means the lection is not regular; it is
                    // assumed to be changed.
                    $lection['status'] = LECTION_STATUS_CHANGED;
                }

                echo " - Status (part 1) checked\n";
            }
        }

        // Release the DOM explicitly so memory does not pile up across pages.
        $html->clear();
        unset($html);

        $lections[] = $lection;
        print_r($lection);

        echo " - Lection added!</textarea><br /><br />";

        $num += 1;
    }

    return $lections;
}

/**
 * Build the list of weeks to fetch, starting at the first week and running
 * through $amount_of_weeks weeks after it (both ends included).
 *
 * Week numbers above AMOUNT_OF_WEEKS_IN_A_YEAR wrap around into the next year.
 *
 * @param int $amount_of_weeks Number of weeks to add after the first week.
 * @return array List of arrays with 'week' and 'year' keys.
 */
function get_weeks($amount_of_weeks)
{
    $current_year = date('Y');

    // Demo: the first week is pinned to 44 instead of the current date('W').
    $first_week = 44;
    $final_week = $first_week + $amount_of_weeks;

    $result = array();

    $n = $first_week;
    while ($n <= $final_week)
    {
        // Past the last week of the year: continue counting in the next year.
        $wraps = $n > AMOUNT_OF_WEEKS_IN_A_YEAR;

        $result[] = array(
            'week' => $wraps ? $n - AMOUNT_OF_WEEKS_IN_A_YEAR : $n,
            'year' => $wraps ? $current_year + 1 : $current_year,
        );

        $n++;
    }

    return $result;
}

// Driver: build the week list (the first week plus the 5 weeks after it),
// collect the lection links for those weeks, then fetch and parse every
// lection page.
$weeks = get_weeks(5);
$links = get_links_for_lections($weeks);
$lections = get_lections($links);
// Dump the parsed lections between two horizontal rules.
echo "<hr />";
print_r($lections);
echo "<hr />";

回答

1

我運行了這個腳本,它工作正常,我一直跑到了第 96 個。如果要我猜,我會說你達到了最大執行時間(max execution time)。嘗試在頂部添加:set_time_limit(0); 否則,請嘗試調整錯誤報告(error reporting)設置,並在此處發佈出現的任何錯誤。

+1

它在你那邊能正常運行嗎?真奇怪。設置 'set_time_limit(0)' 沒有改變任何東西。我想我的虛擬主機(webhotel)不允許我這樣做。它看起來並不像超時,因爲頁面加載並沒有花很長時間。我也覺得奇怪的是,如果是超時,它爲什麼每次都停在同一個地方。 – simonbs

+0

我剛剛在另一臺服務器上測試過。你是對的,似乎最大的執行時間已經達到。 – simonbs