3
我正在使用 Simple HTML DOM 解析器對一個含有許多子頁面的網頁進行屏幕抓取。出於某種原因,它可以正常解析前40個子頁面,但是處理到第41個時,腳本在沒有任何錯誤訊息的情況下中止了。(PHP Simple HTML DOM 解析器)
我製作了這個測試頁面(this test page),並嘗試記錄腳本中執行的所有操作以及 Simple HTML DOM 解析器中的一些事件(event),但是我一直無法找到錯誤所在。
有沒有人知道為什麼腳本在解析第41個URL時會中止?或者是否有人知道 Simple HTML DOM 解析器在哪些情況下會失敗?
我的測試頁:http://snuzzer.dk/pub/shdp/parse.php
這是我的腳本,我使用的是未經修改的 Simple HTML DOM 解析器。關鍵的部分發生在 get_lections() 中,我已經在調用 Simple HTML DOM 解析器的位置做了標記。
// Values stored under a lection's 'status' key.
const LECTION_STATUS_REGULAR = 0;
const LECTION_STATUS_CHANGED = 1;
const LECTION_STATUS_CANCELLED = 2;

// Values stored under a lection's 'documents' key.
const LECTION_DOCUMENTS_NONE = 0;
const LECTION_DOCUMENTS_TRUE = 1;

// Week number after which get_weeks() rolls over into the following year.
const AMOUNT_OF_WEEKS_IN_A_YEAR = 52;

// Provides file_get_html() and the DOM objects queried with find().
include_once "simple_html_dom.php";
/**
 * Normalises a scraped HTML fragment into plain text.
 *
 * The steps run from the innermost call outwards: surrounding whitespace is
 * trimmed, tags are stripped, HTML entities (including both quote styles)
 * are decoded as UTF-8, and the result is passed through utf8_decode().
 *
 * @param string $text Raw HTML fragment.
 * @return string The cleaned text.
 */
function clean_text($text)
{
    return utf8_decode(
        html_entity_decode(
            strip_tags(trim($text)),
            ENT_QUOTES,
            "UTF-8"
        )
    );
}
/**
 * Collects the absolute URLs of every lection linked from the schedule page
 * of each given week.
 *
 * Progress is echoed into an HTML <textarea> for debugging.
 *
 * @param array $weeks List of arrays, each with a 'week' and a 'year' key;
 *                     both are concatenated into the "week" query parameter.
 * @return array List of URL strings in page order (regular, changed, then
 *               cancelled lections per week). Empty when nothing was found.
 */
function get_links_for_lections($weeks)
{
    // Initialised up front so the caller always gets an array, even when no
    // week yields a single link.
    $links = array();

    // One selector per lection state; the order decides the order of $links.
    $selectors = array(
        'a[class="s2skemabrik s2bgbox s2withlink"]',
        'a[class="s2skemabrik s2bgbox s2changed s2withlink"]',
        'a[class="s2skemabrik s2bgbox s2cancelled s2withlink"]',
    );

    echo "Finding links<br /><textarea style=\"width:70%;height:150px;\">";
    foreach ($weeks as $week)
    {
        echo " * Retrieving HTML...\n";
        $html = file_get_html("http://www.lectio.dk/lectio/285/SkemaNy.aspx?type=elev&elevid=2444366210&week=" . $week['week'] . $week['year']);
        if (!$html)
        {
            // file_get_html() can hand back a non-object when the fetch
            // fails; calling find() on it would abort the whole script.
            echo " * Could not retrieve HTML, skipping week\n";
            continue;
        }
        echo " * HTML retrieved...\n";

        foreach ($selectors as $selector)
        {
            foreach ($html->find($selector) as $lection)
            {
                $links[] = "http://www.lectio.dk" . $lection->href;
            }
        }

        // Release the parsed DOM before fetching the next page so memory
        // does not accumulate across iterations.
        $html->clear();
        unset($html);
    }
    echo "</textarea>\n<hr />";
    return $links;
}
/**
 * Returns the given property of the first element matching $selector inside
 * $row, or an empty string when the row has no such element.
 *
 * @param object $row      DOM node exposing find().
 * @param string $selector Selector passed to find(), e.g. "td".
 * @param string $property Property to read, e.g. "innertext" or "plaintext".
 * @return string
 */
function _lection_first_match($row, $selector, $property)
{
    $matches = $row->find($selector);
    if (empty($matches) || !isset($matches[0]))
    {
        return '';
    }
    return $matches[0]->$property;
}

/**
 * Fetches every lection page in $links and extracts its details from the
 * table rows of the page.
 *
 * Each returned lection is an array that always has 'status' and 'documents'
 * and, depending on which rows the page contains, 'type', 'title', 'subject',
 * 'teachers', 'location', 'note' and 'homework'. Pages that cannot be
 * retrieved are reported in the debug output and skipped.
 *
 * NOTE(review): no "Status:" row is handled here, so
 * LECTION_STATUS_CANCELLED is never assigned by this function.
 *
 * @param array $links List of lection page URLs.
 * @return array List of lection arrays, in the order of $links.
 */
function get_lections($links)
{
    // Row heading => lection key, for rows whose first <td> is stored as
    // cleaned text. The key doubles as the label in the debug output.
    $text_fields = array(
        "Type:" => "type",
        "Titel:" => "title",
        "Hold:" => "subject",
        "Lærere:" => "teachers",
        "Lokaler:" => "location",
        "Note:" => "note",
        "Lektier:" => "homework",
    );

    // Create array to hold lections
    $lections = array();
    if (empty($links))
    {
        return $lections;
    }

    // Loop through links
    $num = 1;
    foreach ($links as $link)
    {
        echo $num . ". " . $link . "<br />\n<textarea style=\"width:70%;height:150px;\">";
        $num += 1;

        // Initialize lection
        $lection = array();
        $lection['status'] = LECTION_STATUS_REGULAR;
        $lection['documents'] = LECTION_DOCUMENTS_NONE;

        echo " * Retrieving HTML...\n";
        $html = file_get_html($link);
        if (!$html)
        {
            // A failed fetch yields a non-object; calling find() on it would
            // abort the script without a visible error.
            echo " * Could not retrieve HTML, skipping</textarea><br /><br />";
            continue;
        }
        echo " * HTML retrieved\n";

        // Loop through rows
        foreach ($html->find("tr") as $row)
        {
            echo " * New cell\n";
            // Get name of row; rows without a <th> get an empty name and
            // therefore match none of the branches below.
            $row_name = _lection_first_match($row, "th", "innertext");
            echo " - Row name: \"" . $row_name . "\"\n";

            if (isset($text_fields[$row_name]))
            {
                $field = $text_fields[$row_name];
                echo " - Checking " . $field . "...\n";
                $lection[$field] = clean_text(_lection_first_match($row, "td", "innertext"));
                echo " - " . ucfirst($field) . " checked\n";
            }
            elseif ($row_name == "Dokumenter:")
            {
                echo " - Checking documents...\n";
                $content = clean_text(_lection_first_match($row, "td", "plaintext"));
                if ($content)
                {
                    // We can't get the titles of the documents as we are not logged in
                    // Instead we tell the user that there are documents available
                    $lection['documents'] = LECTION_DOCUMENTS_TRUE;
                }
                echo " - Documents checked\n";
            }
            elseif ($row_name == "Vises:")
            {
                echo " - Checking status (part 1)...\n";
                $content = clean_text(_lection_first_match($row, "td", "plaintext"));
                if (strpos($content, ",") !== false)
                {
                    // A comma in this row means the lection is NOT REGULAR:
                    // it is either changed or cancelled. We assume changed.
                    $lection['status'] = LECTION_STATUS_CHANGED;
                }
                echo " - Status (part 1) checked\n";
            }
        }

        // Release the parsed DOM before the next page so memory does not
        // accumulate over dozens of fetches.
        $html->clear();
        unset($html);

        // Add lection to array of lections
        $lections[] = $lection;
        print_r($lection);
        echo " - Lection added!</textarea><br /><br />";
    }
    return $lections;
}
/**
 * Builds the list of weeks to fetch, starting at $start_week and running
 * through $start_week + $amount_of_weeks inclusive (so $amount_of_weeks + 1
 * entries are returned).
 *
 * Week numbers above AMOUNT_OF_WEEKS_IN_A_YEAR wrap around into the
 * following year(s).
 *
 * @param int $amount_of_weeks Number of weeks to add after the first one.
 * @param int $start_week      First week number. Defaults to 44, the demo
 *                             value this function has always used; pass
 *                             (int) date('W') to start at the current week.
 * @return array List of arrays with 'week' and 'year' keys.
 */
function get_weeks($amount_of_weeks, $start_week = 44)
{
    $weeks = array();

    // 'o' is the ISO-8601 week-numbering year, the one that belongs to ISO
    // week numbers; the calendar year 'Y' differs from it around New Year.
    $year_now = (int) date('o');

    // Last week to fetch
    $last_week = $start_week + $amount_of_weeks;

    // Add weeks to array
    for ($i = $start_week; $i <= $last_week; $i++)
    {
        // Division/modulo instead of a single subtraction so that ranges
        // spanning more than one year boundary still produce valid weeks.
        $years_ahead = (int) floor(($i - 1) / AMOUNT_OF_WEEKS_IN_A_YEAR);
        $weeks[] = array(
            'week' => $i - $years_ahead * AMOUNT_OF_WEEKS_IN_A_YEAR,
            'year' => $year_now + $years_ahead,
        );
    }
    return $weeks;
}
// Script entry point: build the week list, collect the lection links for
// those weeks, scrape each lection page, then dump the result between two
// horizontal rules.
$weeks = get_weeks(5);
$links = get_links_for_lections($weeks);
$lections = get_lections($links);
echo "<hr />", print_r($lections, true), "<hr />";
它在你那邊能正常運行嗎?真奇怪。設置 `set_time_limit(0)` 沒有改變任何東西,我想我的虛擬主機不允許我這樣做。它看起來並不像是超時,因為頁面加載的時間並不長。我也覺得奇怪的是,如果是超時,它為什麼每次都停在同一個地方。 – simonbs
我剛剛在另一臺服務器上測試過。你是對的,似乎最大的執行時間已經達到。 – simonbs