2013-10-21 167 views
39

有時我們需要從Word文檔中提取文本以供日後使用,例如在用戶上傳的履歷(CV)中搜索字符串。如何打開並讀取用戶上傳的Word文檔、從中取得文本,是一個常見問題;網上有一些有用的鏈接,但都不能完整地解決問題。我們需要在上傳時就提取文本並保存到數據庫中,這樣便可以輕鬆地在數據庫中進行搜索。如何用PHP從 .doc、.docx、.xlsx、.pptx 文件中提取文本?

回答

55

這是一個簡單的類,可以正確處理 .doc/.docx 文件(參見 PHP docx reader: Convert MS Word Docx files to text)。

/**
 * Extracts plain text from Microsoft Office documents.
 *
 * Supported extensions: .doc (legacy binary, best effort), .docx, .xlsx and
 * .pptx. The three OOXML formats are ZIP containers holding XML parts; they
 * are read with ZipArchive and DOMDocument, so the zip and dom extensions
 * must be available for those formats.
 */
class DocxConversion
{
    /** @var string Path of the document to convert. */
    private $filename;

    /**
     * @param string $filePath Path of the document to convert.
     */
    public function __construct($filePath)
    {
        $this->filename = $filePath;
    }

    /**
     * Best-effort text extraction from a legacy binary .doc file.
     *
     * The raw bytes are split on carriage returns; fragments that are empty
     * or contain a NUL byte are discarded, the rest are joined with a single
     * space and every character outside a small ASCII whitelist is removed.
     *
     * @return string Extracted text ('' when the file cannot be read).
     */
    private function read_doc()
    {
        if (!is_readable($this->filename)) {
            return '';
        }
        // Reading the whole file in one call avoids a leaked handle and the
        // zero-length fread() error on empty files.
        $raw = file_get_contents($this->filename);
        if ($raw === false || $raw === '') {
            return '';
        }

        $outtext = '';
        foreach (explode(chr(0x0D), $raw) as $thisline) {
            if ($thisline === '' || strpos($thisline, chr(0x00)) !== false) {
                continue;
            }
            $outtext .= $thisline . ' ';
        }

        // Whitelist: letters, digits, whitespace and , . - @ / _ ( )
        return preg_replace('/[^a-zA-Z0-9\s,.\-\n\r\t@\/_()]/', '', $outtext);
    }

    /**
     * Extracts the text of a .docx file from its word/document.xml part.
     *
     * Adjacent table cells are separated by a space and every paragraph end
     * becomes "\r\n" before the remaining tags are stripped.
     *
     * @return string|false Extracted text, or false when the archive cannot be opened.
     */
    private function read_docx()
    {
        $zip = new ZipArchive();
        if ($zip->open($this->filename) !== true) {
            return false;
        }
        $content = $zip->getFromName('word/document.xml');
        $zip->close();

        if ($content === false) {
            // Archive opened but has no main document part.
            $content = '';
        }

        $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content);

        return strip_tags($content);
    }

    /**
     * Parses an XML string and returns it with all tags stripped.
     *
     * Entity substitution (LIBXML_NOENT) is deliberately NOT enabled and
     * network access is disabled, because the documents are untrusted input.
     *
     * @param string|false $xml Raw XML, or false when the part was not found.
     * @return string Text content ('' when the XML is missing or malformed).
     */
    private function xml_to_text($xml)
    {
        if (!is_string($xml) || $xml === '') {
            return '';
        }
        $dom = new DOMDocument();
        if (!$dom->loadXML($xml, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return '';
        }
        return strip_tags($dom->saveXML());
    }

    /************************excel sheet************************************/

    /**
     * Extracts the shared strings (xl/sharedStrings.xml) of an .xlsx workbook.
     *
     * @param string|null $input_file Workbook path; defaults to the path given to the constructor.
     * @return string Text of the shared-strings part ('' on any failure).
     */
    public function xlsx_to_text($input_file = null)
    {
        if ($input_file === null) {
            $input_file = $this->filename;
        }

        $zip_handle = new ZipArchive();
        if ($zip_handle->open($input_file) !== true) {
            return '';
        }
        $xml_datas = $zip_handle->getFromName('xl/sharedStrings.xml');
        $zip_handle->close();

        return $this->xml_to_text($xml_datas);
    }

    /*************************power point files*****************************/

    /**
     * Extracts the text of every slide (ppt/slides/slideN.xml, N = 1, 2, ...)
     * of a .pptx presentation, stopping at the first missing slide number.
     *
     * @param string|null $input_file Presentation path; defaults to the path given to the constructor.
     * @return string Concatenated slide text ('' on any failure).
     */
    public function pptx_to_text($input_file = null)
    {
        if ($input_file === null) {
            $input_file = $this->filename;
        }

        $zip_handle = new ZipArchive();
        if ($zip_handle->open($input_file) !== true) {
            return '';
        }

        $output_text = '';
        $slide_number = 1;
        while (($xml_datas = $zip_handle->getFromName('ppt/slides/slide' . $slide_number . '.xml')) !== false) {
            $output_text .= $this->xml_to_text($xml_datas);
            $slide_number++;
        }
        $zip_handle->close();

        return $output_text;
    }

    /**
     * Converts the document given to the constructor into text, choosing the
     * reader from the file extension (compared case-insensitively).
     *
     * @return string|false Extracted text; "File Not exists" when the file is
     *                      missing; "Invalid File Type" for an unsupported or
     *                      absent extension; false when a .docx archive
     *                      cannot be opened.
     */
    public function convertToText()
    {
        if (isset($this->filename) && !file_exists($this->filename)) {
            return "File Not exists";
        }

        $fileArray = pathinfo((string) $this->filename);
        // pathinfo() omits the 'extension' key for names without a dot.
        $file_ext = isset($fileArray['extension']) ? strtolower($fileArray['extension']) : '';

        switch ($file_ext) {
            case 'doc':
                return $this->read_doc();
            case 'docx':
                return $this->read_docx();
            case 'xlsx':
                return $this->xlsx_to_text($this->filename);
            case 'pptx':
                return $this->pptx_to_text($this->filename);
            default:
                return "Invalid File Type";
        }
    }
}

.doc(Document_file_format)文件是二進制 blob,可以使用 fopen 讀取;而 .docx 文件只是裝在 ZIP 容器中的 XML 文件(xml files in a zipfile container,來源:Wikipedia),可以使用 zip_open 讀取。

使用上面的類

// Create a converter for one document; the constructor takes the file path.
// The commented-out lines show the same call for the other file types.
$docObj = new DocxConversion("test.doc"); 
//$docObj = new DocxConversion("test.docx"); 
//$docObj = new DocxConversion("test.xlsx"); 
//$docObj = new DocxConversion("test.pptx"); 
// Store the result of convertToText() in $docText and print it in one statement.
echo $docText= $docObj->convertToText(); 
+0

感謝這個很棒的答案。我試過你的代碼,但它不起作用,只顯示空白。我想是我的 zip 擴展沒有正常工作,你能幫我找出問題嗎? –

+0

這個類讀取數以千計的文件有多高效?它是一次讀取整個內容還是可以先讀取第一頁? – Volatil3

+0

@M Khalid Junaid如何讀取圖像或數學類型方程的docx文件,是否有一些程序來做到這一點或圖書館? – Strawberry

8

從DOC文件:

// Path of the .doc file to read.
$filename = 'ypue file';

if (file_exists($filename) && ($fh = fopen($filename, 'r')) !== false) {
    // Read the first 0xA00 bytes; the four bytes at 0x21C..0x21F are combined
    // below into a little-endian text length.
    // NOTE(review): presumably this matches the Word 97-2003 header layout — confirm.
    $headers = fread($fh, 0xA00);

    // Only index into the header when all four length bytes were actually read.
    if ($headers !== false && strlen($headers) > 0x21F) {
        $n1 = ord($headers[0x21C]) - 1;
        $n2 = (ord($headers[0x21D]) - 8) * 256;
        $n3 = ord($headers[0x21E]) * 256 * 256;
        $n4 = ord($headers[0x21F]) * 256 * 256 * 256;

        $textLength = $n1 + $n2 + $n3 + $n4;

        // fread() rejects a non-positive length, so skip such files.
        if ($textLength > 0) {
            $extracted_plaintext = fread($fh, $textLength);

            if ($extracted_plaintext !== false) {
                echo nl2br($extracted_plaintext);

                print_r(extract_emails_from($extracted_plaintext));
            }
        }
    }

    fclose($fh);
}

    /**
     * Returns every e-mail-like token found in a string.
     *
     * A token is one or more of [._a-zA-Z0-9-], an "@", then one or more of
     * the same characters; no further validation is performed.
     *
     * @param string $string Text to scan.
     * @return array List of matched addresses (empty when none are found).
     */
    function extract_emails_from($string) {
      preg_match_all('/[._a-zA-Z0-9-]+@[._a-zA-Z0-9-]+/i', $string, $matches);
      return $matches[0];
    }

從DOCX:

/* Name (path) of the document file; this value is passed to extracttext() at the end of the script. */ 
    $document = 'your file'; 

    /**
     * Extracts the plain text of a .docx or .odt document.
     *
     * Both formats are ZIP archives: for a "docx" extension the part
     * word/document.xml is read, for any other extension content.xml is
     * assumed (the .odt layout).
     *
     * @param string $filename Path of the document.
     * @return string Text with all XML tags stripped, or "File not found"
     *                when the archive or the expected part cannot be read.
     */
    function extracttext($filename) {
        // pathinfo() replaces end(explode()), which passed a temporary by reference.
        $ext = strtolower(pathinfo($filename, PATHINFO_EXTENSION));
        $dataFile = ($ext === 'docx') ? "word/document.xml" : "content.xml";

        $zip = new ZipArchive;
        if ($zip->open($filename) !== true) {
            return "File not found";
        }

        // Read the part and close the archive right away so the handle is
        // released on every path.
        $text = $zip->getFromName($dataFile);
        $zip->close();

        if ($text === false || $text === '') {
            return "File not found";
        }

        // loadXML() is an instance method. LIBXML_NOENT is deliberately left
        // out so an untrusted document cannot trigger entity expansion (XXE).
        $xml = new DOMDocument();
        if (!$xml->loadXML($text, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return "File not found";
        }

        // Remove XML formatting tags and return the text.
        return strip_tags($xml->saveXML());
    }

// Print whatever extracttext() returns for the file named by $document.
echo extracttext($document); 
+1

嗨,我在理解這段代碼時遇到了一些困難,能否添加一些註釋讓它更容易理解? – OshoParth

0

對於DOCX文檔,我建議使用docx2txt工具(至少在Debian/Ubuntu上可用):

# Run docx2txt with the .docx file redirected to its standard input.
docx2txt < your_file.docx 

README解釋如何將它與vim集成。添加到你的.vimrc

" use docx2txt.pl to allow VIm to view the text content of a .docx file directly. 
" Before a *.docx file is read, mark the buffer read-only.
autocmd BufReadPre *.docx set ro 
" After it is read, filter the whole buffer (%) through the external docx2txt command.
autocmd BufReadPost *.docx %!docx2txt 

(它也解釋瞭如何與emacs集成)。

對於黑客來說,這個工具是用perl編寫的。

+1

這僅適用於docx,不適用於.docs –

1

//對於DOCX:如果你想保留空白,並且要處理表格的 tr 和 tc,請使用下面的代碼,並按需要修改。以下所有函數都會先從遠程或本地下載文件。

//=========DOCX===========
/**
 * Downloads a .docx file, extracts its text and both writes it to a file
 * and echoes it.
 *
 * Paragraph starts/ends and table rows become "\n\r", <w:tab/> becomes a tab,
 * and all remaining tags are stripped. The download is staged in a unique
 * temporary file (removed afterwards) instead of a fixed name in the working
 * directory, so concurrent calls no longer overwrite each other.
 *
 * @param string $url       Location of the .docx file, fetched with get_url().
 * @param string $file_name Path the extracted text is written to.
 * @return void
 */
function extractDocxText($url,$file_name){
    // " " is the result when the document cannot be downloaded or opened.
    $output_text = " ";

    $docx = get_url($url);
    $temp_file = (is_string($docx) && $docx !== '') ? tempnam(sys_get_temp_dir(), 'docx') : false;

    if ($temp_file !== false) {
        file_put_contents($temp_file, $docx);

        $zip_handle = new ZipArchive;
        if (true === $zip_handle->open($temp_file)) {
            // Archive opened: the result is "" unless the main part is found.
            $output_text = "";
            $xml_datas = $zip_handle->getFromName("word/document.xml");
            if ($xml_datas !== false) {
                $replace_newlines = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-z"0-9 :="]+">/',"\n\r",$xml_datas);
                $replace_tableRows = preg_replace('/<w:tr>/',"\n\r",$replace_newlines);
                $replace_tab = preg_replace('/<w:tab\/>/',"\t",$replace_tableRows);
                $replace_paragraphs = preg_replace('/<\/w:p>/',"\n\r",$replace_tab);
                $output_text = strip_tags($replace_paragraphs);
            }
            $zip_handle->close();
        }

        unlink($temp_file);
    }

    //save to file or echo content
    file_put_contents($file_name,$output_text);
    echo $output_text;
}

//========PDF===========
//Requires installation in your Linux server
//sudo su
//apt-get install xpdf
/**
 * Downloads a PDF, converts it to text with pdf2text(), echoes the text and
 * writes it to a file.
 *
 * The download is staged in a unique temporary file that is always removed,
 * even when the conversion throws.
 * NOTE(review): pdf2text() is not defined in this file; it is assumed to take
 * a file path and return the extracted text — confirm.
 *
 * @param string $url                      Location of the PDF, fetched with get_url().
 * @param string $PDF_fullpath_or_Filename Path the extracted text is written to.
 * @return void
 */
function extractPdfText($url,$PDF_fullpath_or_Filename){
    $content = '';

    $pdf = get_url($url);
    $temp_file = is_string($pdf) ? tempnam(sys_get_temp_dir(), 'pdf') : false;

    if ($temp_file !== false) {
        try {
            file_put_contents($temp_file, $pdf);
            $content = pdf2text($temp_file);
        } finally {
            unlink($temp_file);
        }
    }

    echo $content;
    file_put_contents($PDF_fullpath_or_Filename,$content);
}



//========DOC==========
/**
 * Downloads a legacy binary .doc file, extracts its readable text, echoes it
 * and writes it to a file.
 *
 * The bytes are split on carriage returns; fragments that are empty or
 * contain a NUL byte are dropped and the rest are joined with "\n\r". Every
 * character outside the whitelist (letters, digits, whitespace and
 * , . - @ / _ ( )) is then replaced by a space. The character class is
 * negated here: without the "^" the allowed characters would be the ones
 * blanked out, the opposite of the sibling extractors.
 *
 * The download is processed in memory, so no temporary file is left behind.
 *
 * @param string $url       Location of the .doc file, fetched with get_url().
 * @param string $file_name Path the extracted text is written to.
 * @return void
 */
function extractDocText($url,$file_name){
    $doc = get_url($url);

    $outtext = "";
    if (is_string($doc)) {
        foreach (explode(chr(0x0D), $doc) as $thisline) {
            if ($thisline === '' || strpos($thisline, chr(0x00)) !== false) {
                continue;
            }
            $outtext .= $thisline."\n\r";
        }
    }

    $content = preg_replace('/[^a-zA-Z0-9\s,.\-\n\r\t@\/_()]/',' ',$outtext);

    echo $content;
    file_put_contents($file_name,$content);
}


//========XLSX==========
/**
 * Downloads an .xlsx workbook, collects the text of every shared-string cell
 * on the first worksheet, echoes it and writes it to a file.
 *
 * The two XML parts needed (xl/sharedStrings.xml and
 * xl/worksheets/sheet1.xml) are read straight from the archive. Nothing is
 * unpacked to disk, so an untrusted archive cannot drop files on the server
 * and no extraction directory is left behind.
 *
 * @param string $url       Location of the workbook, fetched with get_url().
 * @param string $file_name Path the extracted text is written to.
 * @return void
 */
function extractXlsxText($url,$file_name){
    $content = "";
    $strings = false;
    $sheet = false;

    $xlsx = get_url($url);
    $temp_file = (is_string($xlsx) && $xlsx !== '') ? tempnam(sys_get_temp_dir(), 'xlsx') : false;

    if ($temp_file !== false) {
        file_put_contents($temp_file, $xlsx);

        $zip = new ZipArchive();
        if ($zip->open($temp_file) === true) {
            $strings_xml = $zip->getFromName('xl/sharedStrings.xml');
            $sheet_xml = $zip->getFromName('xl/worksheets/sheet1.xml');
            $zip->close();

            $xml_options = LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING;
            if (is_string($strings_xml) && $strings_xml !== '') {
                $strings = simplexml_load_string($strings_xml, 'SimpleXMLElement', $xml_options);
            }
            if (is_string($sheet_xml) && $sheet_xml !== '') {
                $sheet = simplexml_load_string($sheet_xml, 'SimpleXMLElement', $xml_options);
            }
        }

        unlink($temp_file);
    }

    if ($strings !== false && $sheet !== false && isset($sheet->sheetData->row)) {
        foreach ($sheet->sheetData->row as $xlrow) {
            foreach ($xlrow->c as $cell) {
                // Only cells with t="s" hold an index into the shared strings.
                if (!isset($cell['t']) || (string) $cell['t'] !== 's') {
                    continue;
                }

                $si = $strings->si[(int) (string) $cell->v];
                if ($si === null) {
                    // Index points past the end of the shared-string table.
                    continue;
                }

                // Register & alias the default namespace or the xpath query returns nothing.
                $si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
                $text_nodes = $si->xpath('.//n:t');
                if (!is_array($text_nodes)) {
                    continue;
                }
                // Concatenate all of the 't' node values of this string item.
                foreach ($text_nodes as $t) {
                    $content .= $t." ";
                }
            }
        }
    }

    echo $content;
    file_put_contents($file_name,$content);
}


//========PPT==========
/**
 * Reads a legacy binary .ppt file and writes its extracted text to a file.
 *
 * The bytes are split on 0x0F. A fragment is treated as a text record when
 * three NUL bytes start at offset 1; its text begins at offset 4 and runs up
 * to the next NUL. Characters outside the whitelist (letters, digits,
 * whitespace and , . - @ / _ ( )) are replaced by a space. Each record is
 * PREPENDED to the result, so records appear in reverse file order.
 *
 * The file is processed in memory: no temporary copy is written and an empty
 * or unreadable source simply produces an empty output file.
 *
 * @param string $url       Path or URL of the .ppt file, read with file_get_contents().
 * @param string $file_name Path the extracted text is written to.
 * @return void
 */
function extractPptText($url,$file_name){
    $ppt = file_get_contents($url);
    $outtext = '';

    if (is_string($ppt)) {
        $marker = chr(0x00).chr(0x00).chr(0x00);

        foreach (explode(chr(0x0f), $ppt) as $thisline) {
            if (strpos($thisline, $marker) !== 1) {
                continue;
            }

            $text_line = (string) substr($thisline, 4);
            $end_pos = strpos($text_line, chr(0x00));
            // A record without a terminating NUL contributes an empty line,
            // made explicit instead of passing false as a substr() length.
            $text_line = ($end_pos === false) ? '' : substr($text_line, 0, $end_pos);
            $text_line = preg_replace('/[^a-zA-Z0-9\s,.\-\n\r\t@\/_()]/'," ",$text_line);

            $outtext = $text_line."\n".$outtext;
        }
    }

    //echo $outtext;
    file_put_contents($file_name,$outtext);
}

//========PPTX==========
/**
 * Downloads a .pptx presentation, concatenates the XML of every slide,
 * echoes the result and writes it to a file.
 *
 * Slides are read as ppt/slides/slide1.xml, slide2.xml, ... until a number
 * is missing. Each slide is re-serialised with DOMDocument (formatOutput
 * enabled) and appended WITH its tags: no strip_tags() is applied here. The
 * result always starts with a single space.
 *
 * The download is staged in a unique temporary file that is removed
 * afterwards. LIBXML_NOENT is not used, so an untrusted presentation cannot
 * trigger entity expansion.
 *
 * @param string $url       Location of the presentation, fetched with get_url().
 * @param string $file_name Path the result is written to.
 * @return void
 */
function extractPptxText($url,$file_name){
    $output_text = ' ';

    $pptx = get_url($url);
    $temp_file = (is_string($pptx) && $pptx !== '') ? tempnam(sys_get_temp_dir(), 'pptx') : false;

    if ($temp_file !== false) {
        file_put_contents($temp_file, $pptx);

        $zip_handle = new ZipArchive;
        if (true === $zip_handle->open($temp_file)) {
            $slide_number = 1; //loop through slide files
            while (($xml_datas = $zip_handle->getFromName("ppt/slides/slide".$slide_number.".xml")) !== false) {
                $xml_handle = new DOMDocument();
                $xml_handle->preserveWhiteSpace = true; // keep the spacing between
                $xml_handle->formatOutput = true;       // each piece of slide text
                if ($xml_datas !== '' && $xml_handle->loadXML($xml_datas, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
                    $output_text .= $xml_handle->saveXML();
                }
                $slide_number++;
            }
            $zip_handle->close();
        }

        unlink($temp_file);
    }

    echo $output_text;
    file_put_contents($file_name,$output_text);
}

    /* 

========================================================================== 
========================================================================= 
And below is the get_url() function: better than file_get_contents();
*/ 

/**
 * Fetches a URL with cURL and returns the response body.
 *
 * The URL is trimmed, URL-decoded and has "&amp;" turned back into "&"
 * before the request. Redirects are followed (at most 10), any encoding the
 * local cURL build supports is accepted, and both the connect and the total
 * timeout are set to $timeout seconds.
 *
 * TLS certificates are verified by default; disabling verification exposes
 * the download to man-in-the-middle tampering, so pass $verify_peer = false
 * only for hosts you explicitly trust.
 *
 * @param string $url         Address to fetch.
 * @param int    $timeout     Connect and total timeout in seconds.
 * @param bool   $verify_peer Whether to verify the server's TLS certificate.
 * @return string|false Response body, or false when the request fails.
 */
function get_url($url,$timeout = 5,$verify_peer = true)
{
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt_array($ch, array(
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1",
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_ENCODING       => "",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_SSL_VERIFYPEER => (bool) $verify_peer,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_MAXREDIRS      => 10,
    ));

    $content = curl_exec($ch);
    curl_close($ch);

    return $content;
}
相關問題