2016-03-15 49 views
-1

PHP - 如何通過逐行讀取來提取文本文件中的塊。我有一個如下所示的輸入文本文件:

BEGIN 
#1 
#2 
#3 
#4 
#5 
#6 
1  2015-05-31 2001-11-24 'Name Surname'  ID_1  0 
2  2011-04-01 ?   ?     ID_2  1 
2  2013-02-24 ?   ?     ID_3  1 
2  2014-02-28 ?   'Name Surname'  ID_4  2 
END 
#7  'value 1' 
#8  'value 2' 
#9  'value 3' 
#10  'value 4' 
END 

當文本文件中出現BEGIN時,從那裏開始一個循環:其中每個以#開頭的行是一個鍵,而對應的值分別是其後各行的列,直到END爲止,需要生成如下的數組:

Array ([#1] => Array ([0] => 1 [1] => 2 [2] => 2 [3] => 2) [#2] => Array ([0] => 2015-05-31 [1] => 2011-04-01 [2] => 2013-02-24 [3] => 2014-02-28) [#3] => Array ([0] => 2001-11-24 [1] => ? [2] => ? [3] => ?) [#4] => Array ([0] => 'Name Surname' [1] => ? [2] => ? [3] => 'Name Surname') [#5] => Array ([0] => ID_1 [1] => ID_2 [2] => ID_3 [3] => ID_4) [#6] => Array ([0] => 0 [1] => 1 [2] => 1 [3] => 2)) 

否則,如果沒有BEGIN,而是遇到以#開頭的行,其對應的值就是單引號之間的內容,需要生成類似如下的數組:

Array ([#7] => 'value 1' [#8] => 'value 2' [#9] => 'value 3' [#10] => 'value 4') 

這就是我目前得到的結果,而我當前的代碼如下:

<?php
    /*
     * Parses a text file made of BEGIN ... END "loop" blocks and of stand-alone
     * "#key 'value'" lines. Prints one array per loop (each "#n" key mapped to
     * its column of values), then one array with all stand-alone entries, then
     * the elapsed time.
     */

    // microtime(true) returns the timestamp directly as a float.
    $start = microtime(true);

    ini_set("max_execution_time", 300); // 300 seconds = 5 minutes
    ini_set("pcre.backtrack_limit", "100000000"); // default 100k = "100000"
    ini_set("memory_limit", "1024M");

    $txt_path = "./test_2.txt";
    // Compare with false explicitly: "... or die()" would also abort on an
    // empty file (or one containing just "0"), because those strings are falsy.
    $txt_data = @file_get_contents($txt_path);
    if ($txt_data === false) {
        die("Could not access file: $txt_path");
    }

    /* BEGIN ARRAY FOR LOOP ENTRIES */

    $loop_pattern = "/BEGIN(.*?)END/s";
    // preg_match_all() returns false when PCRE gives up (e.g. backtrack limit
    // reached on a big file); report it instead of silently printing nothing.
    if (preg_match_all($loop_pattern, $txt_data, $matches) === false) {
        die("Could not parse file: $txt_path (PCRE error code ".preg_last_error().")");
    }
    $loops = $matches[0];

    // Loop-invariant clean-up: drop the BEGIN/END markers, collapse blanks.
    $strip_patterns = array("/BEGIN(.*?)/", "/END(.*?)/", "/[[:blank:]]+/");
    $strip_replacements = array("", "", " ");

    foreach ($loops as $value) {
        $value = preg_replace($strip_patterns, $strip_replacements, trim($value));

        // The "#n" lines are the keys of this loop.
        preg_match_all('/^#\d+/m', $value, $matches);
        $keys = $matches[0];

        // What is left after removing the key lines are the data rows;
        // flatten them onto a single line.
        $value = preg_replace('/^#\d+\s*/m', '', $value);
        $value = str_replace("\n", " ", $value);

        // One capture group per key: every match is one row and group i
        // collects column i across all rows.
        $row_pattern = '/'.str_repeat("('[^']+'|\S+)\s+", count($keys)).'/';
        preg_match_all($row_pattern, $value, $matches);

        $loop_dic = array_combine($keys, array_slice($matches, 1));

        print_r($loop_dic);
        echo("<br><br>");
    }

    /* END ARRAY FOR LOOP ENTRIES */

    /* BEGIN ARRAY FOR NO LOOP ENTRIES */

    $txt_data_without_loops = preg_replace($loop_pattern, "", $txt_data);
    if ($txt_data_without_loops === null) {
        die("Could not parse file: $txt_path (PCRE error code ".preg_last_error().")");
    }

    // Capture key and value from the same line so they can never get out of
    // step, and take the value from its capture group so it carries no
    // trailing whitespace. A "#key" line with no value on it is skipped.
    preg_match_all("/^(#\S+)[[:blank:]]+('[^']+'|\S+)/m", $txt_data_without_loops, $matches);

    $no_loop_dic = array_combine($matches[1], $matches[2]);
    print_r($no_loop_dic);
    echo("<br><br>");

    /* END ARRAY FOR NO LOOP ENTRIES */

    $total_time = round((microtime(true) - $start), 4);
    echo '<br><br><b>Page generated in '.$total_time.' seconds.</b><br><br>';
?>

作爲第一種方法,爲了得到BEGIN-END循環及其對應的數組,我用以下方式讀取輸入文件:

$txt_path = "./input.txt";
// Read the whole file into memory. The result is compared with false
// explicitly because "... or die()" would also abort on an empty file (or one
// containing just "0"): those strings are falsy although the read succeeded.
$txt_data = @file_get_contents($txt_path);
if ($txt_data === false) {
    die("<b>Could not access file: $txt_path</b><br><br>");
}

這對小文件運行良好,但是,在輸入文件很大的情況下,瀏覽器會長時間無響應(我正在用Firefox測試),也許是因爲要解析整個大文件(我的筆記本電腦有3GB內存)。

我試圖在PHP文件如下設置:

// Raise the runtime limits before parsing: execution time (300 seconds =
// 5 minutes), PCRE backtracking (default 100k = "100000") and memory.
$runtime_limits = array(
    "max_execution_time" => 300,
    "pcre.backtrack_limit" => "100000000",
    "memory_limit" => "1024M",
);
foreach ($runtime_limits as $option => $setting) {
    ini_set($option, $setting);
}

這似乎解決了不太大的文件的問題;而對於大文件,只有在沒有同時佔用很多資源的情況下,進程才能無錯誤地完成……所以,這不是最好的解決方案。

在網絡上搜索,我發現this page在那裏我閱讀:

如果你要讀取文件,請逐行讀取,而不是把整個文件讀入內存。看看 fgets 和 SplFileObject::fgets。

所以我決定使用fgets來讀取和解析整個輸入文件。 在爲所有行生成數組後,我需要從每個循環中提取數據,並將其添加到loops_array中,同時將其他no_loop鍵值對添加到另一個數組中。

我的想法(看起來速度很快)是用以下方式找到每個BEGIN的索引:

$txt_path = "./test.txt";
$txt_data = @fopen($txt_path, "rb") or die("<b>Could not access file: $txt_path</b><br/><br/>");

// Read the file line by line, keeping every non-empty trimmed line under its
// zero-based line number.
//  - fgets() is called without a length limit, so a long line is never split
//    into several entries.
//  - Looping on the return value stops exactly at end of file, with no extra
//    empty read.
//  - Only truly empty lines are skipped; a line consisting of "0" is kept
//    (array_filter() without a callback would drop it as falsy).
$lines = array();
$line_number = 0;
while (($line = fgets($txt_data)) !== false) {
    $line = trim($line);
    if ($line !== '') {
        $lines[$line_number] = $line;
    }
    $line_number++;
}
fclose($txt_data);

// Line numbers of all lines that are exactly "BEGIN".
$begins = array_keys($lines, "BEGIN", true);

但現在我需要爲$begins數組中的每個元素找到其後第一個END的索引……如果我這樣做:

// Gather the index of every entry of $lines that equals "END", in order.
$ends = array();
foreach ($lines as $index => $line) {
    if ($line == "END") {
        $ends[] = $index;
    }
}

它也會把輸入文件no_loop區域中的END字符串計算在內,而我應該在每個BEGIN之後找到第一個匹配的END字符串的索引,然後把它們組合起來:

// Map every BEGIN index to the first END index that follows it.
// array_combine($begins, $ends) only works when both lists have the same
// length, which is not the case as soon as a no_loop region contributes its
// own END line. Such an END lies before the next BEGIN and is skipped here.
// Assumes $begins and $ends are ascending, zero-based lists, as array_keys()
// produces them.
$begins_ends = array();
$end_total = count($ends);
$end_pos = 0;
foreach ($begins as $begin) {
    // Skip ENDs located before this BEGIN (they close a no_loop region).
    while ($end_pos < $end_total && $ends[$end_pos] < $begin) {
        $end_pos++;
    }
    if ($end_pos >= $end_total) {
        break; // unterminated BEGIN: no END left to pair it with
    }
    $begins_ends[$begin] = $ends[$end_pos];
    $end_pos++;
}

然後用array_slice提取所有循環,最後把每個$loop加入一個新數組$loops,大致如下:

// Move every BEGIN..END run (both marker lines included) out of $lines and
// into $loops, one sub-array per loop. After the foreach, $lines holds only
// the lines that belong to no loop.
$loops = array();
foreach ($begins_ends as $begin => $end) {
    $this_loop = array();
    for ($el = $begin; $el <= $end; $el++) {
        // Some indices may be missing from $lines (e.g. blank lines that were
        // filtered out); skip them instead of collecting NULL entries.
        if (!isset($lines[$el])) {
            continue;
        }
        $this_loop[] = $lines[$el];
        unset($lines[$el]);
    }
    $loops[] = $this_loop;
}

問題在於如何得到正確的$ends數組(不計入輸入文件no_loop區域中的END字符串),從而得到前面所示的輸出:

Array ([#1] => Array ([0] => 1 [1] => 2 [2] => 2 [3] => 2) [#2] => Array ([0] => 2015-05-31 [1] => 2011-04-01 [2] => 2013-02-24 [3] => 2014-02-28) [#3] => Array ([0] => 2001-11-24 [1] => ? [2] => ? [3] => ?) [#4] => Array ([0] => 'Name Surname' [1] => ? [2] => ? [3] => 'Name Surname') [#5] => Array ([0] => ID_1 [1] => ID_2 [2] => ID_3 [3] => ID_4) [#6] => Array ([0] => 0 [1] => 1 [2] => 1 [3] => 2)) 

Array ([#7] => 'value 1' [#8] => 'value 2' [#9] => 'value 3' [#10] => 'value 4') 

並且要用最快的方法和最低的內存佔用,解決文件很大時瀏覽器無響應的問題。

謝謝

+2

你需要澄清你的問題是什麼,並把代碼精簡爲一個[MCVE]。另外,如果你想逐行處理一個文本文件,你可能需要看看'file()'函數。 – miken32

+0

簡單的投票,但會更好的閱讀和找到解決方案。我正在尋找所發佈問題的答案:什麼不明確? –

回答

0

值得一提的是,沒有必要使用fgets(),用fread()即可;該信息的來源在這裏(here)。

正如你可以閱讀那裏,file()是非常類似於以前使用的file_get_contents(),所以它應該沒有區別。

之前可以正常工作的代碼只需做如下簡單的調整:

  • test_2.txt文件內容:

BEGIN 
#1 
#2 
#3 
#4 
#5 
#6 
1  2015-05-31 2001-11-24 'Name Surname'  ID_1  0 
2  2011-04-01 ?   ?     ID_2  1 
2  2013-02-24 ?   ?     ID_3  1 
2  2014-02-28 ?   'Name Surname'  ID_4  2 
END 
#7  'value 1' 
#8  'value 2' 
#9  'value 3' 
#10  'value 4' 
END 
BEGIN 
#11 
#12 
#13 
#14 
#15 
#16 
1  2015-05-31 2001-11-24 'Name Surname'  ID_5  0 
2  2011-04-01 ?   ?     ID_6  1 
2  2013-02-24 ?   ?     ID_7  1 
2  2014-02-28 ?   'Name Surname'  ID_8  2 
END 
BEGIN 
#17 
#18 
#19 
#20 
#21 
#22 
1  2015-05-31 2001-11-24 'Name Surname'  ID_9  0 
2  2011-04-01 ?   ?     ID_10  1 
2  2013-02-24 ?   ?     ID_11  1 
2  2014-02-28 ?   'Name Surname'  ID_12  2 
END 
  • PHP代碼:

<?php
/*
 * Reads ./test_2.txt in one piece with fread(), extracts every BEGIN ... END
 * block and prints, for each block, an array mapping each "#n" key to its
 * column of values, followed by the elapsed time.
 */

// microtime(true) returns the timestamp directly as a float.
$start = microtime(true);

$filename = "./test_2.txt";
$handle = fopen($filename, "rb") or die("<b>Could not access file: $filename</b><br/><br/>");
// fread() does not accept a length of 0, so an empty file is read as "".
$size = filesize($filename);
$contents = ($size > 0) ? fread($handle, $size) : "";
fclose($handle);
if ($contents === false) {
    die("<b>Could not access file: $filename</b><br/><br/>");
}

$loop_pattern = "/BEGIN(.*?)END/s";
// preg_match_all() returns false when PCRE gives up (e.g. backtrack limit
// reached on a big file); report it instead of silently printing nothing.
if (preg_match_all($loop_pattern, $contents, $matches) === false) {
    die("<b>Could not parse file: $filename (PCRE error code ".preg_last_error().")</b><br/><br/>");
}
$loops = $matches[0];

// Loop-invariant clean-up: collapse blanks, then drop the BEGIN and END
// markers together with whatever follows them on the same line.
$strip_patterns = array("/[[:blank:]]+/", "/BEGIN(.*)/", "/END(.*)/");
$strip_replacements = array(" ", "", "");

foreach ($loops as $value) {
    $value = preg_replace($strip_patterns, $strip_replacements, trim($value));

    // The "#n" lines are the keys of this loop.
    preg_match_all('/^#\d+/m', $value, $matches);
    $keys = $matches[0];

    // What is left after removing the key lines are the data rows; flatten
    // them onto a single line.
    $value = preg_replace('/^#\d+\s*/m', '', $value);
    $value = str_replace("\n", " ", $value);

    // One capture group per key: every match is one row and group i collects
    // column i across all rows.
    $row_pattern = '/'.str_repeat("('[^']+'|\S+)\s+", count($keys)).'/';
    preg_match_all($row_pattern, $value, $matches);

    $values = array_combine($keys, array_slice($matches, 1, count($keys), false));
    print_r($values);
    echo "<br><br>";
}

$total_time = round((microtime(true) - $start), 4);
echo("<br/><br/><b>Page generated in ".$total_time." seconds.</b><br/><br/>");
?>

我也刪除@,寫作:

// No @ operator here, so the warning fopen() raises on failure is not suppressed.
fopen($filename, "rb") or die("<b>Could not access file: $filename</b><br/><br/>"); 

,而不是以前:

// The leading @ suppresses the warning fopen() would otherwise raise on failure.
@fopen($txt_path, "rb") or die("<b>Could not access file: $txt_path</b><br/><br/>"); 

的建議here


編輯1

另一種方法是:

$txt_path = "./test_2.txt";
$handle = new SplFileObject($txt_path);

// Loop until we reach the end of the file, keeping every non-empty trimmed
// line under its zero-based line index. Only truly empty lines are skipped:
// a line consisting of "0" is kept, whereas array_filter() without a callback
// would drop it as falsy.
$lines_array = array();
$line_index = 0;
while (!$handle->eof()) {
    $line = trim($handle->fgets());
    if ($line !== '') {
        $lines_array[$line_index] = $line;
    }
    $line_index++;
}

// Unset the file to call __destruct(), closing the file handle.
$handle = null;

// Re-assemble the kept lines into one newline-separated string.
$lines_joined = implode("\n", $lines_array);