2012-06-04 165 views
0

我想用PHP在PDF文件中搜索字符串,就像strstr()在普通字符串中搜索那樣。但是當PDF以純文本的形式讀取時,得到的只是一堆無法理解的亂碼。我該怎麼做?

也許pdflib有一些解決方案,但我的託管服務提供商不幫我安裝它。

+1

看到這個帖子http://stackoverflow.com/questions/1882318/search-through-pdf-files-with-php –

+1

你也可以看看這個帖子:http://stackoverflow.com/questions/1004478/read-pdf-files-with-php – Val

回答

2

你不僅可以使用PDFlib庫,也可以使用FPDF或TCPDF。

另外,這裏(here)有一篇關於用PHP讀取PDF的好文章。

2

我使用了上一個回答中的鏈接,並爲了滿足自己的需求(搜索並替換,SEARCH AND REPLACE)修改了部分代碼。文章中的函數不需要額外的庫,但替換(REPLACE)部分需要FPDI庫。

下面這段代碼並不完美,但對於任何想實現類似功能的人來說,它是一個不錯的起點。

這個腳本的主要問題是:新數據是覆蓋在原始PDF模板之上的(模板相當於一張圖像)。這意味着如果你想用5行文本替換2行文本,它不會插入文本,而是覆蓋這2行以及其後的3行。這也意味着,無論您要覆蓋什麼內容,填充的背景都必須與原有背景一致。

但對於一些基本需求,例如把一份精美的商業計劃書或報價單作爲PDF模板,該腳本可以完全在PHP應用程序內覆蓋其中的日期或客戶名稱。

/**
 * Decodes a PDF ASCIIHexDecode stream.
 *
 * Pairs of hexadecimal digits are converted to bytes until the '>'
 * end-of-data marker is reached. White space is skipped and a '%' starts a
 * comment that runs to the end of the line. When the data holds an odd
 * number of digits, the last digit is treated as if it were followed by '0'.
 *
 * @param string $input Encoded data, terminated by '>'.
 * @return string The decoded bytes, or "" when a non-hex character is found
 *                or the '>' marker is missing.
 */
function decodeAsciiHex($input) {
    $output = "";
    $length = strlen($input);

    $highNibble = null;   // first digit of the pair currently being read
    $isComment = false;
    $terminated = false;

    for ($i = 0; $i < $length; $i++) {
        $c = $input[$i];

        // The end-of-data marker stops decoding wherever it appears.
        if ($c === '>') {
            $terminated = true;
            break;
        }

        if ($isComment) {
            // Double quotes are required here: '\r' in single quotes is a
            // two-character string and would never match.
            if ($c === "\r" || $c === "\n")
                $isComment = false;
            continue;
        }

        if (strpos("\0\t\r\f\n ", $c) !== false)
            continue;

        if ($c === '%') {
            $isComment = true;
            continue;
        }

        if (strpos("0123456789abcdefABCDEF", $c) === false)
            return "";

        $code = hexdec($c);
        if ($highNibble === null) {
            $highNibble = $code;
        } else {
            $output .= chr($highNibble * 16 + $code);
            $highNibble = null;
        }
    }

    if (!$terminated)
        return "";

    // A dangling high nibble means an odd digit count: pad it with a zero.
    if ($highNibble !== null)
        $output .= chr($highNibble * 16);

    return $output;
}

/**
 * Decodes a PDF ASCII85Decode stream.
 *
 * Every group of five characters in the range '!'..'u' encodes four bytes;
 * 'z' at a group boundary stands for four zero bytes. White space is
 * skipped and decoding stops at the first '~' (start of the "~>" marker) or
 * at the end of the input. A final group of 2 to 4 characters yields 1 to 3
 * bytes.
 *
 * Note that '%' is an ordinary base-85 digit (value 4) and is decoded as
 * data, not treated as a comment introducer.
 *
 * @param string $input Encoded data.
 * @return string The decoded bytes, or "" when an invalid character is
 *                found or the final group consists of a single character.
 */
function decodeAscii85($input) {
    $output = "";
    $length = strlen($input);

    $digits = array();   // base-85 digits of the group being assembled
    $state = 0;          // number of digits collected for the current group

    for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
        $c = $input[$i];

        // Double-quoted escapes: '\n' in single quotes is not a newline.
        if (strpos("\0\t\r\f\n ", $c) !== false)
            continue;

        if ($c === 'z' && $state === 0) {
            $output .= str_repeat(chr(0), 4);
            continue;
        }

        $code = ord($c);
        if ($code < ord('!') || $code > ord('u'))
            return "";

        $digits[$state++] = $code - ord('!');

        if ($state === 5) {
            $state = 0;
            for ($sum = 0, $j = 0; $j < 5; $j++)
                $sum = $sum * 85 + $digits[$j];
            for ($j = 3; $j >= 0; $j--)
                $output .= chr(($sum >> ($j * 8)) & 0xFF);
        }
    }

    if ($state === 1)
        return "";

    if ($state > 1) {
        // Pad the short group with the highest digit ('u' = 84) so that
        // truncating the result gives back the original leading bytes.
        for ($sum = 0, $j = 0; $j < 5; $j++)
            $sum = $sum * 85 + ($j < $state ? $digits[$j] : 84);
        for ($j = 0; $j < $state - 1; $j++)
            $output .= chr(($sum >> ((3 - $j) * 8)) & 0xFF);
    }

    return $output;
}

/**
 * Inflates the payload of a FlateDecode stream with gzuncompress().
 *
 * @param string $input Compressed bytes.
 * @return string|false The uncompressed data, or false when gzuncompress()
 *                      fails (its warning is suppressed).
 */
function decodeFlate($input) {
    $inflated = @gzuncompress($input);

    return $inflated;
}

/**
 * Extracts the entries of the first "<< ... >>" dictionary found in an
 * object body.
 *
 * The dictionary text is split on "/". Each piece has its white space
 * collapsed; a piece made of a single word becomes `word => true`, while a
 * piece with several words becomes `firstWord => secondWord` (any further
 * words are dropped).
 *
 * @param string $object Raw text of a PDF object.
 * @return array Map of entry name to value; empty when no dictionary is found.
 */
function getObjectOptions($object) {
    if (!preg_match("#<<(.*)>>#ismU", $object, $matches)) {
        return array();
    }

    // Whatever precedes the first "/" is not an entry and is discarded.
    $entries = explode("/", $matches[1]);
    array_shift($entries);

    $parsed = array();
    foreach ($entries as $entry) {
        $entry = preg_replace("#\s+#", " ", trim($entry));

        if (strpos($entry, " ") === false) {
            $parsed[$entry] = true;
            continue;
        }

        $words = explode(" ", $entry);
        $parsed[$words[0]] = $words[1];
    }

    return $parsed;
}
/**
 * Applies the decoding filters named in an object's options to its stream.
 *
 * When the options carry no "Filter" entry the stream is returned as is.
 * Otherwise the stream is cut to "Length" bytes (when that entry is set)
 * and every option whose name is ASCIIHexDecode, ASCII85Decode or
 * FlateDecode triggers the matching decoder, in the order in which the
 * names occur as keys of $options.
 *
 * @param string $stream  Raw stream bytes.
 * @param array  $options Entries as returned by getObjectOptions().
 * @return string|false The decoded data; whatever the last decoder returned.
 */
function getDecodedStream($stream, $options) {
    if (empty($options["Filter"])) {
        return $stream;
    }

    if (!empty($options["Length"])) {
        $byteCount = $options["Length"];
    } else {
        $byteCount = strlen($stream);
    }
    $decoded = substr($stream, 0, $byteCount);

    foreach (array_keys($options) as $name) {
        if ($name == "ASCIIHexDecode") {
            $decoded = decodeAsciiHex($decoded);
        }
        if ($name == "ASCII85Decode") {
            $decoded = decodeAscii85($decoded);
        }
        if ($name == "FlateDecode") {
            $decoded = decodeFlate($decoded);
        }
    }

    return $decoded;
}
/**
 * Collects the raw text operands found in a list of text containers.
 *
 * For each container the "[...] TJ" pattern is tried first; only when it
 * finds nothing is the "Td (...) Tj" pattern tried. All captures of the
 * pattern that matched are appended to $texts.
 *
 * @param array $texts          Accumulator, extended in place.
 * @param array $textContainers Container strings, indexed from 0.
 * @return void
 */
function getDirtyTexts(&$texts, $textContainers) {
    $patterns = array(
        "#\[(.*)\]\s*TJ#ismU",
        "#Td\s*(\(.*\))\s*Tj#ismU",
    );

    $total = count($textContainers);
    for ($index = 0; $index < $total; $index++) {
        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $textContainers[$index], $found)) {
                $texts = array_merge($texts, @$found[1]);
                break;
            }
        }
    }
}
/**
 * Reads character mappings from the bfchar / bfrange sections of a stream
 * and adds them to $transformations.
 *
 * Keys and values are hexadecimal strings. bfchar keys are the source code
 * right-padded with "0" to four digits, exactly as written in the stream;
 * bfrange keys and values are produced with "%04X" and are therefore upper
 * case. Each section is read line by line, and at most as many lines as the
 * count that precedes the section keyword are considered.
 *
 * @param array  $transformations Map of source code to replacement, extended in place.
 * @param string $stream          Decoded stream contents.
 * @return void
 */
function getCharTransformations(&$transformations, $stream) {
    preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
    preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

    foreach ($chars as $section) {
        $count = $section[1];
        // Accept any end-of-line convention, not just "\n".
        $lines = preg_split('/\r\n|\r|\n/', trim($section[2]));

        for ($k = 0; $k < $count && $k < count($lines); $k++) {
            if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($lines[$k]), $map))
                $transformations[str_pad($map[1], 4, "0")] = $map[2];
        }
    }

    foreach ($ranges as $section) {
        $count = $section[1];
        $lines = preg_split('/\r\n|\r|\n/', trim($section[2]));

        for ($k = 0; $k < $count && $k < count($lines); $k++) {
            $line = trim($lines[$k]);

            if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                // <from> <to> <start>: consecutive targets beginning at <start>.
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $start = hexdec($map[3]);

                for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
                    $transformations[sprintf("%04X", $m)] = sprintf("%04X", $start + $n);
            } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                // <from> <to> [<t1> <t2> ...]: one explicit target per code.
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $targets = preg_split("#\s+#", trim($map[3]));

                for ($m = $from, $n = 0; $m <= $to && $n < count($targets); $m++, $n++) {
                    // Strip the "<" ">" delimiters: hexdec() only tolerates
                    // non-hex characters with a deprecation notice.
                    $target = trim($targets[$n], "<>");
                    $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($target));
                }
            }
        }
    }
}
/**
 * Converts raw text operands into readable text.
 *
 * Each operand may mix literal strings "(...)" and hexadecimal strings
 * "<...>"; anything outside those delimiters (such as kerning numbers) is
 * ignored. A line feed is appended after every operand.
 *
 * Literal strings: backslash escapes (\n \r \t \b \f \\ \( \), one to three
 * octal digits, and backslash + end-of-line as a line continuation) are
 * resolved; balanced unescaped parentheses are kept as text.
 *
 * Hexadecimal strings: the digits are read in groups of four (a short last
 * group is right-padded with "0"). Each group is looked up in
 * $transformations, ignoring letter case; the result, or the group itself
 * when no mapping exists, is emitted as the corresponding character(s),
 * four hex digits per character.
 *
 * @param array $texts           Raw operands, e.g. "(Hel)-20(lo)" or "<00410042>".
 * @param array $transformations Map of 4-digit hex code to replacement hex string.
 * @return string The decoded text (UTF-8).
 */
function getTextUsingTransformations($texts, $transformations) {
    // Double-quoted on purpose: '\n' in single quotes would be a literal
    // backslash followed by "n". PHP has no "\b" escape, hence "\x08".
    $escapes = array(
        "n" => "\n", "r" => "\r", "t" => "\t", "b" => "\x08", "f" => "\f",
        "\\" => "\\", "(" => "(", ")" => ")",
    );
    $hexDigits = "0123456789abcdefABCDEF";

    $document = "";
    foreach ($texts as $text) {
        $isHex = false;
        $isPlain = false;
        $depth = 0;      // nesting level of unescaped parentheses
        $hex = "";
        $plain = "";
        $length = strlen($text);

        for ($j = 0; $j < $length; $j++) {
            $c = $text[$j];

            if ($isPlain) {
                if ($c === "\\") {
                    if ($j + 1 >= $length)
                        continue;   // dangling backslash at end of operand

                    $c2 = $text[$j + 1];
                    if (isset($escapes[$c2])) {
                        $plain .= $escapes[$c2];
                        $j++;
                    } elseif (preg_match('/^[0-7]{1,3}/', substr($text, $j + 1, 3), $oct)) {
                        // Octal escapes denote one byte; high-order overflow is dropped.
                        $plain .= html_entity_decode("&#" . (octdec($oct[0]) & 0xFF) . ";", ENT_QUOTES, "UTF-8");
                        $j += strlen($oct[0]);
                    } elseif ($c2 === "\r" || $c2 === "\n") {
                        // Line continuation: swallow the end-of-line marker.
                        $j++;
                        if ($c2 === "\r" && $j + 1 < $length && $text[$j + 1] === "\n")
                            $j++;
                    }
                    // Any other escape: the backslash is ignored and the next
                    // character is handled as ordinary text on the next pass.
                } elseif ($c === "(") {
                    $depth++;
                    $plain .= $c;
                } elseif ($c === ")") {
                    if ($depth > 0) {
                        $depth--;
                        $plain .= $c;
                    } else {
                        $document .= $plain;
                        $isPlain = false;
                    }
                } else {
                    // Includes "<" and ">", which are plain text in here.
                    $plain .= $c;
                }
                continue;
            }

            if ($isHex) {
                if ($c === ">") {
                    if ($hex !== "") {
                        foreach (str_split($hex, 4) as $group) {
                            $chex = str_pad($group, 4, "0");
                            // bfrange keys are upper case while the operand may
                            // use either case, so try all spellings.
                            foreach (array($chex, strtoupper($chex), strtolower($chex)) as $candidate) {
                                if (isset($transformations[$candidate])) {
                                    $chex = (string) $transformations[$candidate];
                                    break;
                                }
                            }
                            if ($chex === "")
                                continue;
                            // A mapping may expand to several characters.
                            foreach (str_split($chex, 4) as $unit)
                                $document .= html_entity_decode("&#x" . $unit . ";", ENT_QUOTES, "UTF-8");
                        }
                    }
                    $isHex = false;
                } elseif (strpos($hexDigits, $c) !== false) {
                    $hex .= $c;
                }
                // White space inside a hex string is skipped.
                continue;
            }

            if ($c === "(") {
                $plain = "";
                $depth = 0;
                $isPlain = true;
            } elseif ($c === "<") {
                $hex = "";
                $isHex = true;
            }
        }
        $document .= "\n";
    }

    return $document;
}

/**
 * Returns the operands that immediately precede the first "Tm" operator of
 * a text container, or the first "Td" operator when there is no "Tm".
 *
 * @param string $container Contents of one BT ... ET block.
 * @return array|null array(x, y) taken from the last two white-space
 *                    separated tokens before the operator, or null when
 *                    fewer than two tokens precede it.
 */
function pdf2textPosition($container) {
    $pos = strpos($container, "Tm");
    if ($pos === false)
        $pos = strpos($container, "Td");
    if ($pos === false)
        return null;

    $prefix = trim(substr($container, 0, $pos));
    if ($prefix === "")
        return null;

    // Split on any white space and keep every token: a coordinate of "0"
    // is a legitimate value and must not be filtered out.
    $tokens = preg_split('/\s+/', $prefix);
    $count = count($tokens);
    if ($count < 2)
        return null;

    return array($tokens[$count - 2], $tokens[$count - 1]);
}

/**
 * Searches the text of a PDF file for a keyword and reports where it is.
 *
 * Every "obj ... endobj" section that holds a stream and whose dictionary
 * has no Length1, Type or Subtype entry is decoded. Streams containing
 * BT ... ET blocks are searched; other streams are scanned for character
 * mappings, which are applied to the text of streams processed afterwards.
 *
 * The reported 'page' is a running count of the stream objects examined so
 * far (starting at 1), not a page number read from the document structure.
 * The reported 'font' is the size operand of the most recent "Tf" operator
 * seen, or null when none has been seen.
 *
 * @param string $filename Path of the PDF file.
 * @param string $search   Text to look for.
 * @return array|false|string array('keyword', 'page', 'font', 'x', 'y') for
 *         the first hit; false when the keyword is not found; "" when the
 *         file cannot be read or is empty.
 */
function pdf2text($filename, $search) {
    // Deliberately best effort: an unreadable file is reported through the
    // "" return value rather than a warning.
    $infile = @file_get_contents($filename);
    if (empty($infile))
        return "";

    if (!preg_match_all("#obj(.*)endobj#ismU", $infile, $objects))
        return false;

    $pageNumber = 1;
    $fontSize = null;
    $transformations = array();

    foreach ($objects[1] as $currentObject) {
        if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
            continue;
        $stream = ltrim($stream[1]);

        $options = getObjectOptions($currentObject);
        if (!empty($options["Length1"]) || !empty($options["Type"]) || !empty($options["Subtype"]))
            continue;

        $data = getDecodedStream($stream, $options);
        if (is_string($data) && $data !== "") {
            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) {
                foreach ($textContainers[1] as $container) {
                    // "/FontName size Tf" selects the font; keep the size.
                    if (preg_match('#/\S+\s+([0-9]*\.?[0-9]+)\s+Tf#', $container, $font))
                        $fontSize = $font[1];

                    // Extract the texts of this container only, so each text
                    // is paired with the container it really came from.
                    $texts = array();
                    getDirtyTexts($texts, array($container));

                    foreach ($texts as $value) {
                        $textTransformed = getTextUsingTransformations(array($value), $transformations);
                        if (strpos($textTransformed, $search) === false)
                            continue;

                        $position = pdf2textPosition($container);
                        if ($position === null)
                            continue;

                        return array(
                            'keyword' => $search,
                            'page' => $pageNumber,
                            'font' => $fontSize,
                            'x' => $position[0],
                            'y' => $position[1],
                        );
                    }
                }
            } else {
                getCharTransformations($transformations, $data);
            }
        }

        $pageNumber++;
    }

    return false;
}

現在的用法:

我的做法是保存一個PDF模板文件,其中包含一些將被覆蓋的「關鍵詞」。例如,爲了更改模板PDF上的日期,我在需要顯示日期的位置放入關鍵字@DATE,其他關鍵字同理。

如果在PDF中找不到要搜索的詞,修改後的pdf2text函數返回false。如果找到,它將返回一個數組,其中包含搜索關鍵字、所在的頁碼、該詞的字體大小以及X和Y值。

// Usage example: import every page of a template PDF and paint replacement
// text over the @DATE and @CLIENT placeholders located by pdf2text().
// dateInSpanish() and $x_nombre_cli are expected to be defined by the
// surrounding application.

// initiate FPDI
$pdf = new FPDI('P', 'pt', 'A4');
$pdf->setPrintHeader(false);

// set the source file
$pdffile = "template.pdf";
$numberOfPages = $pdf->setSourceFile($pdffile);

// Locate each placeholder once, before the pages are rebuilt.
$theReturnedFecha = pdf2text($pdffile, "@DATE");
$theReturnedCliente = pdf2text($pdffile, "@CLIENT");


// import all pages
for($i = 1; $i <= $numberOfPages; $i++){
    $pdf->AddPage();
    $tplIdx = $pdf->importPage($i);
    $pdf->useTemplate($tplIdx);

    // pdf2text() yields an array only on a hit; it returns false when the
    // keyword is missing and "" when the file could not be read, so test
    // for an array rather than for "not false".
    if(is_array($theReturnedFecha) && $theReturnedFecha['page'] == $i){
        $pdf->SetFont('Helvetica');
        $pdf->SetFontSize($theReturnedFecha['font']);
        $pdf->SetTextColor(0, 0, 0);
        // White fill hides the placeholder underneath the new text.
        $pdf->SetFillColor(255,255,255);
        // NOTE(review): the negated y minus the font size presumably converts
        // the PDF coordinate into the library's page coordinate - confirm.
        $pdf->SetXY($theReturnedFecha['x'], ($theReturnedFecha['y']*-1)-$theReturnedFecha['font']);
        $pdf->Cell(0, 0, dateInSpanish("2015-11-04"), 0, 0, 'L', true);
    }

    if(is_array($theReturnedCliente) && $theReturnedCliente['page'] == $i){
        $pdf->SetFont('Helvetica');
        $pdf->SetFontSize($theReturnedCliente['font']);
        $pdf->SetTextColor(0, 0, 0);
        $pdf->SetFillColor(255,255,255);
        $pdf->SetXY($theReturnedCliente['x'], ($theReturnedCliente['y']*-1)-$theReturnedCliente['font']);
        $pdf->Cell(0, 0, $x_nombre_cli, 0, 0, 'L', true);
    }
}

$pdf->Output();

希望它可以幫助任何人

再見

+0

謝謝你。不幸的是,它似乎不起作用:pdf2text無法返回任何參考。您測試過哪些PDF版本? – DeveloperChris