2014-02-12 31 views
0

我有一個程序,使用正則表達式從PDF文檔中提取文本。爲什麼這段正則表達式匹配代碼需要很長時間才能完成?

我的問題是匹配塊需要較長時間對於某些PDF文件執行...

這是代碼:

// Excerpt: pulls bibliographic fields out of the text of two page regions by running
// one regular expression per field. `stripper` and the result variables (patit, patno,
// appno, patfilled, patdate) are declared outside this excerpt.

// Title: "(54)" or "[54]", optional whitespace, then a run of word characters,
// whitespace, commas and hyphens.
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)"; 
// Inventor section, six alternatives. Alternatives 4 and 6 repeat alternatives 1 and 3
// verbatim; alternative 5 is alternative 2 with "\(" instead of "\[" as the opening bracket.
// NOTE(review): alternatives 2 and 5 contain ([...]+)* -- a quantified group inside another
// quantifier -- followed by [\w']* and a (?=\n) lookahead. When the lookahead cannot be
// satisfied, the engine may retry a very large number of ways to split the same run of
// characters, which looks like the source of the long find() times.
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))"; 
// Assignee section, four alternatives, each ending in a lookahead for "Notice:" or a newline.
// NOTE(review): alternative 2 uses the same ([...]+)* construct as the inventor pattern.
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))"; 
// Application number: "(21)" or "[21]" followed by a run of word characters, whitespace
// and the punctuation , . : / -
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)"; 
// Filing date: "(22)" or "[22]" followed by the same character run; alternatives 2 and 4
// add a lookahead for a following "Related" line.
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))"; 
// "Term" followed by a character run; both alternatives are identical. This string is not
// compiled or used anywhere in this excerpt.
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))"; 
// Patent number: "Patent No.:" or "Patent Number:" followed by a character run.
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)"; 
// Patent date: "(45) Date" or "[45] Date" followed by a character run, optionally bounded
// by a lookahead for the next "Inventor:" / "Inventors:" heading.
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)"; 

// Text of the two named regions; the regions "class1" and "class2" are set up outside
// this excerpt.
String region = stripper.getTextForRegion("class1"); 
String regiont = stripper.getTextForRegion("class2"); 

// One compiled pattern and matcher per field. Inventor, assignee, title, application number
// and filing date are searched in `region`; patent number and patent date in `regiont`.
Pattern p = Pattern.compile(in); 
Matcher m = p.matcher(region); 

Pattern p2 = Pattern.compile(as); 
Matcher m2 = p2.matcher(region); 

Pattern p3 = Pattern.compile(title); 
Matcher m3 = p3.matcher(region); 

Pattern p4 = Pattern.compile(pat_no); 
Matcher m4 = p4.matcher(regiont); 

Pattern p5 = Pattern.compile(app_no); 
Matcher m5 = p5.matcher(region); 

Pattern p6 = Pattern.compile(filed); 
Matcher m6 = p6.matcher(region); 

Pattern p7 = Pattern.compile(pat_dt); 
Matcher m7 = p7.matcher(regiont); 

long TIMEOUT = 15000l; // 15 seconds 
long now = System.currentTimeMillis(); // init the long just above the while 

System.out.println("find start"); 

// Inventor matches are located but discarded (the print is commented out).
// This loop has no timeout condition at all.
while(m.find()) 
{ 
    // System.out.println(m.group()); 
} 

// Captured here but not read anywhere in this excerpt.
Long nowtime = System.currentTimeMillis() ; 

// Assignee matches are likewise discarded, again without a timeout condition.
while(m2.find()) 
{ 
    // System.out.println(m2.group()); 

} 

// In this and the following loops find() is evaluated before the elapsed-time check, so the
// check cannot interrupt a single slow find() call; it only prevents the loop body from
// running once TIMEOUT has passed. Every match overwrites the result variable, so the
// last match wins.
while(m3.find() && (System.currentTimeMillis() - now) < TIMEOUT) 
{ 
    // System.out.println(m3.group()); 
    patit = m3.group().replace("(54)", " "); 
    patit = patit.trim(); 
    // System.out.println("m3"); 
} 

// Patent number: strip the label text in its various spellings, then trim.
while(m4.find() && (System.currentTimeMillis() - now) < TIMEOUT) 
{ 
    // System.out.println(m4.group()); 
    patno = m4.group().replace("Patent No.: ", " "); 
    patno = patno.replace("Patent No: ", " "); 
    patno = patno.replace("Patent", " "); 
    patno = patno.replace("No.:", " "); 
    patno = patno.replace("No:", " "); 
    patno = patno.replace("Number: ", " "); 
    patno = patno.replace("Number.: ", " "); 
    patno = patno.trim(); 
    // System.out.println("m4"); 
} 

// Application number: strip the "(21)" code and the "Appl. No.:" label, then trim.
while(m5.find() && (System.currentTimeMillis() - now) < TIMEOUT) 
{ 
    // System.out.println(m5.group()); 
    appno = m5.group().replace("(21)", " "); 
    appno = appno.replace("Appl. No.: ", " "); 
    appno = appno.replace("Appl.", " "); 
    appno = appno.replace("No.", " "); 
    appno = appno.replace(":"," "); 
    appno = appno.trim(); 
    // System.out.println("m5"); 
} 


// Filing date: strip the "(22)" code, the "Filed" / "PCT" labels, colons and "\n", then trim.
while(m6.find() && (System.currentTimeMillis() - now) < TIMEOUT) 
{ 
    // System.out.println(m6.group()); 
    patfilled = m6.group().replace("(22)", " "); 
    patfilled = patfilled.replace("Filed", " "); 
    patfilled= patfilled.replace("PCT", " "); 
    patfilled = patfilled.replace(":", " "); 
    patfilled = patfilled.replace("\n", ""); 
    patfilled= patfilled.trim(); 
    // System.out.println("m6"); 
} 

// Patent date: strip the "(45) Date of Patent" label in its variants, "Reissued",
// colons and asterisks, then trim.
while (m7.find() && (System.currentTimeMillis() - now) < TIMEOUT) 
{ 
    patdate = m7.group().replace("(45) Date of Patent: ", " "); 
    patdate = patdate.replace("(45) Date of Patent.: ", " "); 
    patdate = patdate.replace("(45)", " "); 
    patdate = patdate.replace("Date", " "); 
    patdate = patdate.replace("of", " "); 
    patdate = patdate.replace("Patent.: ", " "); 
    patdate = patdate.replace("Patent: ", " "); 
    patdate = patdate.replace("Reissued", " "); 
    patdate = patdate.replace(":", " "); 
    patdate = patdate.replace("Patent", " "); 
    patdate = patdate.replace("*", " "); 
    patdate = patdate.trim(); 
    // System.out.println("m7"); 
} 

System.out.println("find end"); 

在上面的代碼中,對某些文件而言,mX.find() 需要很長時間才能執行完……因此程序在某些迭代中會在打印 System.out.println("find start"); 之後凍結。

這是輸出示例:(滾動看到)

------- 
    find start 
    1ms Elasped 
    1841 
    File name:06377334.pdf 
    US 6,377,334 B2 
    METHOD FOR CONTROLLING IMAGE 
    SIZE OF INTEGRATED CIRCUITS ON 
    WAFERS SUPPORTED ON HOT PLATES 
    DURING POST EXPOSURE BAKING OF THE 
    WAFERS 
    Apr. 23, 2002 
    Jan. 24, 2001 
    Related U.S. Application Data 
    09/768,973 
    ------- 
    find start 
    1ms Elasped 
    1842 
    File name:06377337.pdf 
    US 6,377,337 B1 
    PROJECTION EXPOSURE APPARATUS 
    Apr. 23, 2002 
    Apr. 27, 1999 
    09/299,558 
    ------- 
    find start 
    1843 
    File name:06377338.pdf 
    US 6,377,338 B1 
    EXPOSURE APPARATUS AND METHOD 
    Apr. 23, 2002 
    Oct. 13, 2000 
    Related U.S. Application Data 
    09/299,558 
    ------- 
    find start 
    1844 
    File name:06377339.pdf 
    US 6,377,339 B1 
    DOCUMENT IMAGING SYSTEM 
    INCORPORATING A SELECTIVELY 
    OPAQUE 
    Apr. 23, 2002 
    Mar. 29, 1999 
    09/280,186 
    ------- 
    find start 
    1845 
    File name:06377340.pdf 
    US 6,377,340 B1 
    METHOD OF DETECTION OF NATURAL 
    DIAMONDS THAT HAVE BEEN PROCESSED 
    AT HIGH PRESSURE AND HIGH 
    TEMPERATURES 
    Apr. 23, 2002 
    Oct. 29, 1999 
    09/430,477 
    ------- 
    find start 
    1846 
    File name:06377341.pdf 
    US 6,377,341 B1 
    REFRACTIVE INDEX BASED DETECTOR 
    SYSTEM FOR LIQUID CHROMATOGRAPHY 
    Apr. 23, 2002 
    Aug. 3, 1999 
    09/368,310 
    ------- 
    find start 

(execution freezes here) 

爲什麼出現這種情況?爲什麼正則表達式匹配器需要很長時間?


在這裏,整個程序:

import java.awt.Rectangle; 
import java.io.File; 
import java.io.IOException; 
import java.io.PrintWriter; 
import java.sql.Connection; 
import java.sql.DriverManager; 
import java.sql.PreparedStatement; 
import java.util.ArrayList; 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 

import org.apache.commons.io.FileUtils; 
import org.apache.commons.io.filefilter.TrueFileFilter; 
import org.apache.commons.io.filefilter.WildcardFileFilter; 
import org.apache.pdfbox.exceptions.InvalidPasswordException; 
import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.pdmodel.PDPage; 
import org.apache.pdfbox.util.PDFTextStripperByArea; 


public class PatentAdder { 

    /** 
    * @param args 
    */ 

    public static String patno,patit,patdate,patfilled,appno; 
    private static int File; 
    /** Directory that is scanned for patent PDFs. */
    private static final String DATA_DIR = "F:/patents/test/tittest/USP2002w17/06/378/pdfs";

    /** Report file; one record per processed PDF is appended while the run lasts. */
    private static final String OUTPUT_FILE = "F:/patents/test/tittest/USP2002w17/06/378/pdfs/output.txt";

    // The field patterns are loop-invariant, so they are compiled once instead of once per file.
    // The expressions themselves are unchanged.

    /** Title: "(54)" or "[54]" followed by word characters, whitespace, commas and hyphens. */
    private static final Pattern TITLE = Pattern.compile(
            "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)");

    /** Patent number: "Patent No.:" or "Patent Number:" followed by a character run. */
    private static final Pattern PATENT_NO = Pattern.compile(
            "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)");

    /** Application number: "(21)" or "[21]" followed by a character run. */
    private static final Pattern APP_NO = Pattern.compile(
            "(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)");

    /** Filing date: "(22)" or "[22]" followed by a character run. */
    private static final Pattern FILED = Pattern.compile(
            "((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))");

    /** Patent date: "(45) Date" or "[45] Date" followed by a character run. */
    private static final Pattern PATENT_DATE = Pattern.compile(
            "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)");

    /**
     * Extracts the patent number, title, patent date, filing date and application number
     * from the first page of every PDF in {@link #DATA_DIR}, printing each record to
     * standard output and to {@link #OUTPUT_FILE}.
     *
     * <p>As before, nothing is processed when exactly one argument is supplied. Files
     * that fail with an {@link IOException} are reported on standard error and skipped;
     * any other exception ends the run and is printed.
     *
     * @param args command-line arguments; only their count is inspected
     */
    public static void main(String[] args) {
        try {
            if (args.length == 1) {
                // usage();
                return;
            }

            File dataDir = new File(DATA_DIR);
            File[] files = dataDir.listFiles();
            if (files == null) {
                // listFiles() yields null when the path is not a readable directory.
                System.err.println("Error: cannot list directory " + dataDir.getPath());
                return;
            }

            int count = 0;
            // One writer for the whole run: reopening the file for every PDF truncated it
            // each time, and the writer was never flushed or closed.
            PrintWriter out = new PrintWriter(new File(OUTPUT_FILE));
            try {
                for (File f : files) {
                    if (f.isDirectory()) {
                        // There is no document to load for a directory.
                        continue;
                    }
                    try {
                        processFile(f, count, out);
                        count++;
                    } catch (IOException e) {
                        // Unreadable or non-PDF file: report it and carry on with the next one.
                        System.err.println("Skipping " + f.getName() + ": " + e.getMessage());
                    }
                }
                System.out.print("-----Finised "+count+" Files------ \n");
            } finally {
                out.close();
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
            System.out.println(e.getCause());
            e.printStackTrace();
        }
    }

    /**
     * Loads one PDF, extracts the fields from two regions of its first page into the
     * static result fields and writes the record to standard output and {@code out}.
     *
     * @param f     the PDF file to read
     * @param count zero-based index of this record, printed as its first line
     * @param out   report writer shared by all records
     * @throws Exception if the document cannot be loaded, decrypted or read
     */
    private static void processFile(File f, int count, PrintWriter out) throws Exception {
        // Clear the previous record so a field that is absent from this file is reported
        // as "null" rather than with the value found in an earlier file.
        patno = null;
        patit = null;
        patdate = null;
        patfilled = null;
        appno = null;

        PDDocument document = PDDocument.load(f.getAbsolutePath());
        try {
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }

            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            stripper.addRegion("class1", new Rectangle(55, 108, 230, 600)); // US-Patent title h40
            stripper.addRegion("class2", new Rectangle(288, 60, 222, 40));

            List allPages = document.getDocumentCatalog().getAllPages();
            PDPage firstPage = (PDPage) allPages.get(0);
            stripper.extractRegions(firstPage);

            String region = stripper.getTextForRegion("class1");
            String regiont = stripper.getTextForRegion("class2");

            System.out.println("find start");
            long started = System.currentTimeMillis();

            String rawTitle = lastMatch(TITLE, region);
            if (rawTitle != null) {
                patit = cleanTitle(rawTitle);
            }
            String rawNumber = lastMatch(PATENT_NO, regiont);
            if (rawNumber != null) {
                patno = cleanPatentNumber(rawNumber);
            }
            String rawApplication = lastMatch(APP_NO, region);
            if (rawApplication != null) {
                appno = cleanApplicationNumber(rawApplication);
            }
            String rawFiled = lastMatch(FILED, region);
            if (rawFiled != null) {
                patfilled = cleanFilingDate(rawFiled);
            }
            String rawDate = lastMatch(PATENT_DATE, regiont);
            if (rawDate != null) {
                patdate = cleanPatentDate(rawDate);
            }

            System.out.println(count);
            out.println(count);

            System.out.println("File name:"+f.getName());
            out.println("File name:"+f.getName());

            System.out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
            out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");

            long elapsed = System.currentTimeMillis() - started;
            System.out.println(elapsed+"ms Elasped");
            out.println(elapsed+"ms Elasped");
        } finally {
            // Close every document, not only the last one of the run.
            document.close();
        }
    }

    /** Returns the text of the last match of {@code pattern} in {@code text}, or null if there is none. */
    private static String lastMatch(Pattern pattern, String text) {
        String last = null;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            last = matcher.group();
        }
        return last;
    }

    /** Strips the "(54)" / "[54]" field code from a matched title. */
    private static String cleanTitle(String raw) {
        // The pattern accepts both bracket styles, so both are removed.
        return raw.replace("(54)", " ").replace("[54]", " ").trim();
    }

    /** Strips the "Patent No." / "Patent Number" label variants from a matched patent number. */
    private static String cleanPatentNumber(String raw) {
        String value = raw.replace("Patent No.: ", " ");
        value = value.replace("Patent No: ", " ");
        value = value.replace("Patent", " ");
        value = value.replace("No.:", " ");
        value = value.replace("No:", " ");
        value = value.replace("Number: ", " ");
        value = value.replace("Number.: ", " ");
        return value.trim();
    }

    /** Strips the "(21)" / "[21]" field code and the "Appl. No.:" label from a matched application number. */
    private static String cleanApplicationNumber(String raw) {
        String value = raw.replace("(21)", " ").replace("[21]", " ");
        value = value.replace("Appl. No.: ", " ");
        value = value.replace("Appl.", " ");
        value = value.replace("No.", " ");
        value = value.replace(":"," ");
        return value.trim();
    }

    /** Strips the "(22)" / "[22]" field code, the "Filed" / "PCT" labels, colons and newlines from a matched filing date. */
    private static String cleanFilingDate(String raw) {
        String value = raw.replace("(22)", " ").replace("[22]", " ");
        value = value.replace("Filed", " ");
        value = value.replace("PCT", " ");
        value = value.replace(":", " ");
        value = value.replace("\n", "");
        return value.trim();
    }

    /** Strips the "(45)" / "[45]" field code, the "Date of Patent" label variants, "Reissued", colons and asterisks from a matched patent date. */
    private static String cleanPatentDate(String raw) {
        String value = raw.replace("(45) Date of Patent: ", " ");
        value = value.replace("(45) Date of Patent.: ", " ");
        value = value.replace("(45)", " ").replace("[45]", " ");
        value = value.replace("Date", " ");
        value = value.replace("of", " ");
        value = value.replace("Patent.: ", " ");
        value = value.replace("Patent: ", " ");
        value = value.replace("Reissued", " ");
        value = value.replace(":", " ");
        value = value.replace("Patent", " ");
        value = value.replace("*", " ");
        return value.trim();
    }

請告訴我如何優化(optimize)這些正則表達式,並解決這個執行凍結的問題...

+1

我很抱歉,但我不認爲我們可以提供幫助。原因可能有很多。也許目錄中有一個損壞的PDF文件或類似的東西。也許PDF中的某個特定頁面有些不同。有些情況下,我們是幫不上忙的。你有沒有看過導致凍結的那個文件(第1847條記錄附近),看看它有什麼不同?你有沒有試過條件斷點調試,在處理到那條記錄時觸發,這樣你就可以單步執行代碼並查看可能發生了什麼? –

回答

0

我自己也喜歡用正則表達式,但看起來它並不是完成你這項工作的理想方法。正則表達式很適合從文本中提取特定位置的信息。但是,在同一段文本上反覆應用多個正則表達式,意味着採用解析器的方法會更好。

您的方法的一個問題是每個while循環都是再次讀取整個文本。這可以通過編寫自己的解析器並讓它在文檔中一次完成來避免。

你的正則表達式的另一個問題是他們有很多可選的部分(\s*等)。可選部件使得對正則表達式的評估成本更高。相反,使用一個非常簡單的正則表達式可能是一個好方法,並重新檢查它的匹配位置是否有誤報。例如,而不是你的正則表達式

String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))"; 

,你可以隨便找

String simple_term ="Term"; 

然後對Term每次出現檢查它是否真的是你正在尋找的一部分。

順便說一下,看看我從代碼中隨機挑出的那個字符串,我注意到它比需要的更複雜。只要刪除備選分支 | 即可,因爲第一個和第二個選擇是一樣的。

+1

'可選部分使正則表達式的評估更加昂貴。'確實如此,但不會導致這樣嚴重的減速。減速是由 `\s*\s*` 問題引起的。 – nhahtdh

0

由於您的模式中含有類似 \s*\s*c 的構造——連續的量詞重複,其字符類的交集非空,而後面跟隨的部分與它們不相交——所以會陷入回溯地獄。

讓我們來看看字符串in(正則引擎實際看到的形式):

((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:)) 
| 
((?s)\[\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n)) 
| 
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:)) 
| 
((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:)) 
| 
((?s)\(\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n)) 
| 
(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:)) 

你的正則表達式中有很多這樣的模式:

((?s)\(\d\d\)\s+Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=\(\d*\)\s+Assignee:)) 
           ^^^^^^^^^^^^^^^^^^^^ 

((?s)\[\d\d\)\s+Inventor:\s*([\-\w\d\s,\.\(\)-]+)*[\w\']*(?=\n)) 
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 

(Inventor\w*:\s*\w*([\w\d,.\s)(-]+);([\w\s.\',();-]+)(?=Assignee:)) 
       ^^^^^^^^^^^^^^^^^^^^ 

更不用說,你在正則表達式中把同樣的3個子模式包含了兩次,這是完全多餘的。

+0

我需要這些冗餘的部分……因爲沒有它們,字符串就無法被正確解析 @nhahtdh –

+0

@RageshDAntony:你用到捕獲組的結果了嗎?而且在交替分支中把同一個模式寫兩次是沒有意義的(嗯,假設正則引擎工作正常)。 – nhahtdh

相關問題