我有一個程序使用PDF文檔的正則表達式提取文本。爲什麼這個正則表達式匹配塊需要很長時間才能完成?
我的問題是匹配塊需要較長時間對於某些PDF文件執行...
這是代碼:
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
String region = stripper.getTextForRegion("class1");
String regiont = stripper.getTextForRegion("class2");
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
long TIMEOUT = 15000l; // 15 seconds
long now = System.currentTimeMillis(); // init the long just above the while
System.out.println("find start");
while(m.find())
{
// System.out.println(m.group());
}
Long nowtime = System.currentTimeMillis() ;
while(m2.find())
{
// System.out.println(m2.group());
}
while(m3.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m3.group());
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
// System.out.println("m3");
}
while(m4.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m4.group());
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
// System.out.println("m4");
}
while(m5.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m5.group());
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
// System.out.println("m5");
}
while(m6.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
// System.out.println(m6.group());
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
// System.out.println("m6");
}
while (m7.find() && (System.currentTimeMillis() - now) < TIMEOUT)
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
// System.out.println("m7");
}
System.out.println("find end");
在上面的代碼中,mX.find()需要很長執行某些文件迭代的時間......這就是執行在一些迭代處凍結在System.out.println("find start");
。
這是輸出示例:(滾動看到)
-------
find start
1ms Elasped
1841
File name:06377334.pdf
US 6,377,334 B2
METHOD FOR CONTROLLING IMAGE
SIZE OF INTEGRATED CIRCUITS ON
WAFERS SUPPORTED ON HOT PLATES
DURING POST EXPOSURE BAKING OF THE
WAFERS
Apr. 23, 2002
Jan. 24, 2001
Related U.S. Application Data
09/768,973
-------
find start
1ms Elasped
1842
File name:06377337.pdf
US 6,377,337 B1
PROJECTION EXPOSURE APPARATUS
Apr. 23, 2002
Apr. 27, 1999
09/299,558
-------
find start
1843
File name:06377338.pdf
US 6,377,338 B1
EXPOSURE APPARATUS AND METHOD
Apr. 23, 2002
Oct. 13, 2000
Related U.S. Application Data
09/299,558
-------
find start
1844
File name:06377339.pdf
US 6,377,339 B1
DOCUMENT IMAGING SYSTEM
INCORPORATING A SELECTIVELY
OPAQUE
Apr. 23, 2002
Mar. 29, 1999
09/280,186
-------
find start
1845
File name:06377340.pdf
US 6,377,340 B1
METHOD OF DETECTION OF NATURAL
DIAMONDS THAT HAVE BEEN PROCESSED
AT HIGH PRESSURE AND HIGH
TEMPERATURES
Apr. 23, 2002
Oct. 29, 1999
09/430,477
-------
find start
1846
File name:06377341.pdf
US 6,377,341 B1
REFRACTIVE INDEX BASED DETECTOR
SYSTEM FOR LIQUID CHROMATOGRAPHY
Apr. 23, 2002
Aug. 3, 1999
09/368,310
-------
find start
(execution freezes here)
爲什麼出現這種情況?爲什麼正則表達式匹配器需要很長時間?
在這裏,整個程序:
import java.awt.Rectangle;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class PatentAdder {
/**
* @param args
*/
public static String patno,patit,patdate,patfilled,appno;
private static int File;
public static void main(String[] args) {
try {
int cnt=0;
if(args.length == 1)
{
// usage();
}
else
{
PDDocument document = null;
try
{
File dataDir = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs");
File[] files = dataDir.listFiles();
int count=0;
long TIMEOUT1 = 60000l; // 15 seconds
long now1 = System.currentTimeMillis();
for (File file : files) {
try {
// System.out.println ("Satrt2");
File f = file;
if (!f.isDirectory()) {
document = PDDocument.load(f.getAbsolutePath());
if(document.isEncrypted())
{
try
{
document.decrypt("");
}
catch(InvalidPasswordException e)
{
System.err.println("Error: Document is encrypted with a password.");
System.exit(1);
}
} }
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
Rectangle rectt = new Rectangle(288, 60, 222, 40);
Rectangle rect = new Rectangle(55, 108, 230, 600); // US-Patent title h40
stripper.addRegion("class1", rect);
stripper.addRegion("class2", rectt);
List allPages = document.getDocumentCatalog().getAllPages();
PDPage firstPage = (PDPage)allPages.get(0);
stripper.extractRegions(firstPage);
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
String region = stripper.getTextForRegion("class1");
String regiont = stripper.getTextForRegion("class2");
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
System.out.println("find start");
Long nowtime = System.currentTimeMillis() ;
while(m3.find())
{
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
}
while(m4.find())
{
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
}
while(m5.find())
{
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
}
while(m6.find())
{
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
}
while (m7.find())
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
}
PrintWriter out = new PrintWriter (new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs/output.txt"));
System.out.println(count);
out.println(count);
System.out.println("File name:"+f.getName());
out.println("File name:"+f.getName());
System.out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
Long endtime = System.currentTimeMillis()-nowtime;
System.out.println(endtime+"ms Elasped") ;
out.println(endtime+"ms Elasped") ;
count++;
}
catch (IOException e)
{
continue;
}
}
System.out.print("-----Finised "+count+" Files------ \n");
}
finally
{
if(document != null)
{
document.close();
}
}
}
}
catch (Exception e)
{
System.out.println(e.getStackTrace());
//System.out.println(e.getLocalizedMessage());
System.out.println(e.getMessage());
System.out.println(e.getCause());
//System.out.println(e.getClass());
e.printStackTrace();
}
}
請告訴如何optamize正則表達式和解決這個執行凍結問題...
我很抱歉,但我不認爲我們可以提供幫助。它可以是任何數量的東西。也許目錄中有一個損壞的PDF文件或類似的東西。也許在PDF中的特定頁面上有些不同。有些情況下,我們不會有任何幫助。你有沒有看過這個凍結的文件,圍繞1847年的記錄看看有什麼不同?你有沒有試過一個條件調試器,當你到達那條記錄時會觸發,這樣你就可以遍歷代碼並查看可能發生了什麼? –